Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/scala/19.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Spark JavaPairdd中的空指针异常:(javaPairdd.scala:1028)_Java_Scala - Fatal编程技术网

Spark JavaPairdd中的空指针异常:(javaPairdd.scala:1028)

Spark JavaPairdd中的空指针异常:(javaPairdd.scala:1028),java,scala,Java,Scala,我得到了这个例外 Spark版本:2.0.2 17/05/22 13:47:30 ERROR Executor: Exception in task 0.3 in stage 28.0 (TID 33) java.lang.NullPointerException at com.baesystems.ai.analytics.util.RDDUtil.decideBin(RDDUtil.java:47) at com.baesystems.ai.analytics.uti

我得到了这个例外 Spark版本:2.0.2

    17/05/22 13:47:30 ERROR Executor: Exception in task 0.3 in stage 28.0 (TID 33)
java.lang.NullPointerException
    at com.baesystems.ai.analytics.util.RDDUtil.decideBin(RDDUtil.java:47)
    at com.baesystems.ai.analytics.util.RDDUtil.access$400(RDDUtil.java:19)
    at com.baesystems.ai.analytics.util.RDDUtil$1.call(RDDUtil.java:129)
    at com.baesystems.ai.analytics.util.RDDUtil$1.call(RDDUtil.java:102)
    at org.apache.spark.api.java.JavaPairRDD$$anonfun$toScalaFunction$1.apply(JavaPairRDD.scala:1028)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
    at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1765)
    at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1134)
    at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1134)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
    at org.apache.spark.scheduler.Task.run(Task.scala:86)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
17/05/22 13:47:30 INFO CoarseGrainedExecutorBackend: Driver commanded a shutdown
17/05/22 13:47:30 INFO MemoryStore: MemoryStore cleared
ver commanded a shutdown
请告诉我这里发生了什么事。这段代码在我的IDE(Intellj)中运行良好;只有当我尝试在Spark Cluster上以独立模式运行此代码时,才会出现此问题


以下是实际代码:

import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.rdd.RDD;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


public class RDDUtil implements Serializable
{

    /**
     * 
     */
    private static final long serialVersionUID = 1914105980520925932L;
    private static final Logger log = LoggerFactory.getLogger(RDDUtil.class);

    public static Map<String,  java.util.HashMap<String, Integer>> histoMap = new java.util.HashMap<String,  java.util.HashMap<String, Integer>>();
    private static HashMap<String, Integer> histogram0;
    private static java.util.HashMap<String, Integer> histogram1;
    private static java.util.HashMap<String, Integer> histogramBase;
    private static int minValue=0;
    private static int maxValue=0;

    public static int getMinValue() {
        return minValue;
    }

    public static int getMaxValue() {
        return maxValue;
    }

    private static void decideBin(Double label, Double bin) 
    {
       int vCount=0;
        log.error("this value of bin is {} and the label is {}",bin,label);
        histogramBase.put(bin.toString(), 0);
        if(label==0.0) {
            assignZero(histogram1, bin.toString());
            if(!checkIfPresent(histogram0,bin.toString())) {
                vCount++;
                histogram0.put(bin.toString(),vCount);
            }   
         }
         else {
             assignZero(histogram0, bin.toString());

            if(!checkIfPresent(histogram1,bin.toString())) {
                vCount++;
                histogram1.put(bin.toString(),vCount);
            }
         }
     }

    private static boolean checkIfPresent(java.util.HashMap<String, Integer> histogram, String bin) 
    {

        if(histogram.containsKey(bin)) {

            int value = histogram.get(bin);

            value++;
            histogram.put(bin, value);
            return true;    
         }
        else return false;
    }

    private static void assignZero(java.util.HashMap<String, Integer> histogram, String bin)  
    {
        if(!histogram.containsKey(bin)) {
            histogram.put(bin, 0);
        }
    }
    private static void calculateMin(Double bin)
    {
        int tempValue = bin.intValue();
        if(minValue>tempValue) minValue=tempValue;

    }
    private static void calculateMax(Double bin)
    {
        int tempValue = bin.intValue();
        if(tempValue>maxValue) maxValue=tempValue;

    }

    private static JavaRDD<Map<String, HashMap<String, Integer>>> getJavaRDDMap(RDD<LabeledPoint> rdd,int i)
    {
        long val = rdd.count();

        return rdd.toJavaRDD().map(new Function<LabeledPoint,Map<String, HashMap<String, Integer>>>() {


            private static final long serialVersionUID = -7619643561853615982L;

            @Override
            public Map<String, HashMap<String, Integer>> call(LabeledPoint p) {


                Double label = (Double)p.productElement(0);
                Vector v = (Vector)p.productElement(1);

                p.features();
                Double bin =0.0;
                int vSize =p.features().size();
                if(i<vSize) {
                    bin = v.apply(i);
                }
                else {
                    bin = v.apply(vSize-1);
                }

                minValue=bin.intValue();
                calculateMin(bin);
                calculateMax(bin);

                log.error("this value of bin is {} and the iteration is {}",bin , i);
                decideBin(label,bin);

                histoMap.put("0.0", histogram0);
                histoMap.put("1.0", histogram1);
                histoMap.put("@@@BASE@@@", histogramBase);

                return  histoMap;
            }
        });
    }



    public static JavaRDD<Map<String, HashMap<String, Integer>>> computeHistogram(RDD<LabeledPoint> Data, int i)
    {


        histogram0 = new java.util.HashMap<String, Integer>();
        histogram1 = new java.util.HashMap<String, Integer>();
        histogramBase = new java.util.HashMap<String, Integer>();
        maxValue=0;

        JavaRDD<Map<String, HashMap<String, Integer>>> jRdd = getJavaRDDMap(Data,i);
        return jRdd;
    }

}
import java.io.Serializable;
导入java.util.HashMap;
导入java.util.Map;
导入org.apache.spark.api.java.JavaRDD;
导入org.apache.spark.api.java.function.function;
导入org.apache.spark.mllib.linalg.Vector;
导入org.apache.spark.mllib.regression.LabeledPoint;
导入org.apache.spark.rdd.rdd;
导入org.slf4j.Logger;
导入org.slf4j.LoggerFactory;
公共类RDDUtil实现了可序列化
{
/**
* 
*/
私有静态最终长serialVersionUID=1914105980520925932L;
私有静态最终记录器log=LoggerFactory.getLogger(RDDUtil.class);
public static Map histormap=new java.util.HashMap();
私有静态HashMap历史图0;
私有静态java.util.HashMap Historogram1;
私有静态java.util.HashMap Historogrambase;
私有静态int minValue=0;
私有静态int maxValue=0;
公共静态int getMinValue(){
返回最小值;
}
公共静态int getMaxValue(){
返回最大值;
}
专用静态void decideBin(双标签、双箱)
{
int vCount=0;
错误(“bin的这个值是{},标签是{}”,bin,label);
histogramBase.put(bin.toString(),0);
如果(标签==0.0){
赋值零(Historogram1,bin.toString());
如果(!checkIfPresent(historogram0,bin.toString())){
vCount++;
Historogram0.put(bin.toString(),vCount);
}   
}
否则{
赋值零(Historogram0,bin.toString());
如果(!checkIfPresent(historogram1,bin.toString())){
vCount++;
histogram1.put(bin.toString(),vCount);
}
}
}
私有静态布尔checkIfPresent(java.util.HashMap直方图,字符串bin)
{
if(直方图容器(箱)){
int值=直方图.get(bin);
值++;
直方图。放置(箱、值);
返回true;
}
否则返回false;
}
私有静态void assignZero(java.util.HashMap直方图,字符串bin)
{
如果(!histogram.containsKey(bin)){
直方图。放置(bin,0);
}
}
专用静态无效计算器(双箱)
{
int tempValue=bin.intValue();
如果(minValue>tempValue)minValue=tempValue;
}
专用静态void calculateMax(双箱)
{
int tempValue=bin.intValue();
如果(tempValue>maxValue)maxValue=tempValue;
}
私有静态JavaRDD getJavaRDDMap(RDD-RDD,inti)
{
long val=rdd.count();
返回rdd.toJavaRDD().map(新函数(){
私有静态最终长serialVersionUID=-7619643561853615982L;
@凌驾
公共地图调用(标签点p){
双标签=(双)p.productElement(0);
向量v=(向量)p.productElement(1);
p、 特征();
双箱=0.0;
int vSize=p.features().size();

如果(i快速建议:在提交给spark之前尝试重新编译jar

你能提供更多关于你如何提交spark工作的细节吗

将您提供的代码与堆栈跟踪进行比较,行号似乎没有对齐。例如,RDDUtil第19行是注释字符串(“*/”),129是空字符串,102是“覆盖”。在提交spark作业之前,您是否尝试过重新编译jar?行号不匹配以及您的代码在IntelliJ中运行的事实可能是因为spark集群中运行的代码与您自己机器上运行的代码不同


我对这一点不是特别熟悉,但我相信有一种方法可以编辑您的原始帖子。如果您要向其中添加信息(例如您的代码),通常会首选该选项。

以下是我的代码: