Spark JavaPairRDD 中的空指针异常:(JavaPairRDD.scala:1028)
我在运行时遇到了如下异常。Spark 版本:2.0.2。完整堆栈跟踪如下:
17/05/22 13:47:30 ERROR Executor: Exception in task 0.3 in stage 28.0 (TID 33)
java.lang.NullPointerException
at com.baesystems.ai.analytics.util.RDDUtil.decideBin(RDDUtil.java:47)
at com.baesystems.ai.analytics.util.RDDUtil.access$400(RDDUtil.java:19)
at com.baesystems.ai.analytics.util.RDDUtil$1.call(RDDUtil.java:129)
at com.baesystems.ai.analytics.util.RDDUtil$1.call(RDDUtil.java:102)
at org.apache.spark.api.java.JavaPairRDD$$anonfun$toScalaFunction$1.apply(JavaPairRDD.scala:1028)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1765)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1134)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1134)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
17/05/22 13:47:30 INFO CoarseGrainedExecutorBackend: Driver commanded a shutdown
17/05/22 13:47:30 INFO MemoryStore: MemoryStore cleared
ver commanded a shutdown
请告诉我这里发生了什么。这段代码在我的 IDE(IntelliJ)中运行良好;只有当我尝试在 Spark 集群上以独立(standalone)模式运行时,才会出现此问题。
以下是实际代码:
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.rdd.RDD;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class RDDUtil implements Serializable
{
    private static final long serialVersionUID = 1914105980520925932L;
    private static final Logger log = LoggerFactory.getLogger(RDDUtil.class);

    /** Result map: "0.0" / "1.0" / "@@@BASE@@@" -> (bin label -> count). */
    public static Map<String, java.util.HashMap<String, Integer>> histoMap = new java.util.HashMap<String, java.util.HashMap<String, Integer>>();

    // NOTE(review): all of this static state is per-JVM. In cluster mode each
    // executor mutates its OWN copy inside the map() closure and nothing is
    // shipped back to the driver; only local mode (the IDE run) sees consistent
    // values here. The robust fix is Spark accumulators or aggregating the
    // returned RDD instead of static fields.
    private static HashMap<String, Integer> histogram0;
    private static java.util.HashMap<String, Integer> histogram1;
    private static java.util.HashMap<String, Integer> histogramBase;
    private static int minValue = 0;
    private static int maxValue = 0;
    // True until the first bin of a run has seeded minValue/maxValue.
    private static boolean extremaUnseeded = true;

    public static int getMinValue() {
        return minValue;
    }

    public static int getMaxValue() {
        return maxValue;
    }

    /**
     * Lazily creates the histogram maps. computeHistogram() runs on the driver
     * only, so on executor JVMs these maps were still null when the shipped
     * closure invoked decideBin() — the cause of the reported
     * NullPointerException at RDDUtil.decideBin(RDDUtil.java:47).
     */
    private static synchronized void ensureHistograms() {
        if (histogram0 == null) {
            histogram0 = new java.util.HashMap<String, Integer>();
        }
        if (histogram1 == null) {
            histogram1 = new java.util.HashMap<String, Integer>();
        }
        if (histogramBase == null) {
            histogramBase = new java.util.HashMap<String, Integer>();
        }
    }

    /**
     * Records one (label, bin) observation. The bin is always registered in the
     * base histogram; the matching label's histogram gets its count bumped
     * (checkIfPresent increments an existing entry, otherwise the count is set
     * to 1), while the opposite label's histogram is seeded with zero so both
     * histograms share the same key set.
     *
     * @param label class label, expected 0.0 or 1.0
     * @param bin   feature value used as the histogram bucket key
     */
    private static void decideBin(Double label, Double bin)
    {
        ensureHistograms(); // NPE fix: maps may not exist yet in this JVM
        int vCount = 0;
        // Kept at error level to match the original's executor-log diagnostics.
        log.error("this value of bin is {} and the label is {}", bin, label);
        histogramBase.put(bin.toString(), 0);
        if (label == 0.0) {
            assignZero(histogram1, bin.toString());
            if (!checkIfPresent(histogram0, bin.toString())) {
                vCount++;
                histogram0.put(bin.toString(), vCount);
            }
        }
        else {
            assignZero(histogram0, bin.toString());
            if (!checkIfPresent(histogram1, bin.toString())) {
                vCount++;
                histogram1.put(bin.toString(), vCount);
            }
        }
    }

    /**
     * Increments the bin's count when it already exists.
     *
     * @return true if the bin was present (and was incremented), else false
     */
    private static boolean checkIfPresent(java.util.HashMap<String, Integer> histogram, String bin)
    {
        if (histogram.containsKey(bin)) {
            int value = histogram.get(bin);
            value++;
            histogram.put(bin, value);
            return true;
        }
        else return false;
    }

    /** Seeds the bin with a zero count when it is not present yet. */
    private static void assignZero(java.util.HashMap<String, Integer> histogram, String bin)
    {
        if (!histogram.containsKey(bin)) {
            histogram.put(bin, 0);
        }
    }

    /** Lowers minValue when the bin's integer part is smaller. */
    private static void calculateMin(Double bin)
    {
        int tempValue = bin.intValue();
        if (minValue > tempValue) minValue = tempValue;
    }

    /** Raises maxValue when the bin's integer part is larger. */
    private static void calculateMax(Double bin)
    {
        int tempValue = bin.intValue();
        if (tempValue > maxValue) maxValue = tempValue;
    }

    /**
     * Maps each LabeledPoint to the shared histogram map, updating the
     * per-label histograms and the min/max trackers as a side effect.
     *
     * @param rdd labeled points to scan
     * @param i   feature index to histogram; clamped to the last feature
     *            when out of range
     */
    private static JavaRDD<Map<String, HashMap<String, Integer>>> getJavaRDDMap(RDD<LabeledPoint> rdd, final int i)
    {
        // Bug fix: the original ran rdd.count() here and discarded the result,
        // triggering a full extra Spark job for nothing.
        return rdd.toJavaRDD().map(new Function<LabeledPoint, Map<String, HashMap<String, Integer>>>() {
            private static final long serialVersionUID = -7619643561853615982L;

            @Override
            public Map<String, HashMap<String, Integer>> call(LabeledPoint p) {
                Double label = (Double) p.productElement(0);
                Vector v = (Vector) p.productElement(1);
                int vSize = p.features().size();
                // Clamp the requested feature index to the vector's bounds.
                Double bin = (i < vSize) ? v.apply(i) : v.apply(vSize - 1);
                // Bug fix: the original unconditionally wrote
                // minValue = bin.intValue() before calculateMin(), so "min"
                // always ended up as the LAST bin seen (and an all-negative
                // column could never raise max above the 0 baseline).
                // Seed both extrema exactly once per run instead.
                if (extremaUnseeded) {
                    minValue = bin.intValue();
                    maxValue = bin.intValue();
                    extremaUnseeded = false;
                }
                calculateMin(bin);
                calculateMax(bin);
                log.error("this value of bin is {} and the iteration is {}", bin, i);
                decideBin(label, bin);
                histoMap.put("0.0", histogram0);
                histoMap.put("1.0", histogram1);
                histoMap.put("@@@BASE@@@", histogramBase);
                return histoMap;
            }
        });
    }

    /**
     * Resets the histogram state and returns a lazily-evaluated RDD whose
     * elements all reference the shared histogram map. An action (e.g. count())
     * must be run on the result to populate the histograms in this JVM.
     *
     * @param Data labeled points to histogram
     * @param i    feature index to histogram
     */
    public static JavaRDD<Map<String, HashMap<String, Integer>>> computeHistogram(RDD<LabeledPoint> Data, int i)
    {
        histogram0 = new java.util.HashMap<String, Integer>();
        histogram1 = new java.util.HashMap<String, Integer>();
        histogramBase = new java.util.HashMap<String, Integer>();
        minValue = 0;
        maxValue = 0;
        extremaUnseeded = true; // restart min/max seeding for this run
        return getJavaRDDMap(Data, i);
    }
}
import java.io.Serializable;
导入java.util.HashMap;
导入java.util.Map;
导入org.apache.spark.api.java.JavaRDD;
导入org.apache.spark.api.java.function.function;
导入org.apache.spark.mllib.linalg.Vector;
导入org.apache.spark.mllib.regression.LabeledPoint;
导入org.apache.spark.rdd.rdd;
导入org.slf4j.Logger;
导入org.slf4j.LoggerFactory;
公共类RDDUtil实现了可序列化
{
/**
*
*/
私有静态最终长serialVersionUID=1914105980520925932L;
私有静态最终记录器log=LoggerFactory.getLogger(RDDUtil.class);
public static Map histormap=new java.util.HashMap();
私有静态HashMap历史图0;
私有静态java.util.HashMap Historogram1;
私有静态java.util.HashMap Historogrambase;
私有静态int minValue=0;
私有静态int maxValue=0;
公共静态int getMinValue(){
返回最小值;
}
公共静态int getMaxValue(){
返回最大值;
}
专用静态void decideBin(双标签、双箱)
{
int vCount=0;
错误(“bin的这个值是{},标签是{}”,bin,label);
histogramBase.put(bin.toString(),0);
如果(标签==0.0){
赋值零(Historogram1,bin.toString());
如果(!checkIfPresent(historogram0,bin.toString())){
vCount++;
Historogram0.put(bin.toString(),vCount);
}
}
否则{
赋值零(Historogram0,bin.toString());
如果(!checkIfPresent(historogram1,bin.toString())){
vCount++;
histogram1.put(bin.toString(),vCount);
}
}
}
私有静态布尔checkIfPresent(java.util.HashMap直方图,字符串bin)
{
if(直方图容器(箱)){
int值=直方图.get(bin);
值++;
直方图。放置(箱、值);
返回true;
}
否则返回false;
}
私有静态void assignZero(java.util.HashMap直方图,字符串bin)
{
如果(!histogram.containsKey(bin)){
直方图。放置(bin,0);
}
}
专用静态无效计算器(双箱)
{
int tempValue=bin.intValue();
如果(minValue>tempValue)minValue=tempValue;
}
专用静态void calculateMax(双箱)
{
int tempValue=bin.intValue();
如果(tempValue>maxValue)maxValue=tempValue;
}
私有静态JavaRDD getJavaRDDMap(RDD-RDD,inti)
{
long val=rdd.count();
返回rdd.toJavaRDD().map(新函数(){
私有静态最终长serialVersionUID=-7619643561853615982L;
@凌驾
公共地图调用(标签点p){
双标签=(双)p.productElement(0);
向量v=(向量)p.productElement(1);
p、 特征();
双箱=0.0;
int vSize=p.features().size();
快速建议:在提交给 Spark 之前尝试重新编译并重新打包 jar。
你能提供更多关于你如何提交 Spark 作业的细节吗?
将您提供的代码与堆栈跟踪进行比较,行号似乎没有对齐。例如,RDDUtil第19行是注释字符串(“*/”),129是空字符串,102是“覆盖”。在提交spark作业之前,您是否尝试过重新编译jar?行号不匹配以及您的代码在IntelliJ中运行的事实可能是因为spark集群中运行的代码与您自己机器上运行的代码不同
我对这一点不是特别熟悉,但我相信有一种方法可以编辑您的原始帖子。如果您要向其中添加信息(例如您的代码),通常会首选该选项。以下是我的代码: