
Task not serializable error - Spark Java


I have the following code in my Spark application. It is supposed to filter out genes from a CSV file. I am loading the CSV file into a Spark RDD. When I run the jar with spark-submit, I get a Task not serializable exception.

public class AttributeSelector {

    public static final String path = System.getProperty("user.dir") + File.separator;
    public static Queue<Instances> result = new LinkedBlockingQueue<>();
    private static final Logger LOGGER = LoggerFactory.getLogger(AttributeSelector.class);

    int[] selectAttributes(Instances data) {

        int[] indexes = null;
        AttributeSelection filter = new AttributeSelection();
        CfsSubsetEval evaluator = new CfsSubsetEval();
        filter.setEvaluator(evaluator);
        BestFirst search = new BestFirst();
        filter.setSearch(search);
        try {
            filter.SelectAttributes(data);
            indexes = filter.selectedAttributes();
        } catch (Exception e) {
            System.out.println("Error when resampling input data with selected attributes!");
            e.printStackTrace();
        }
        return indexes;

    }

    public void selectData(Instances data, int[] indexes) {

        Instances newData = data;
        Remove remove = new Remove();
        remove.setAttributeIndicesArray(indexes);
        remove.setInvertSelection(true);

        try {
            remove.setInputFormat(data);
            newData = Filter.useFilter(data, remove);
            result.add(newData);
        } catch (Exception e) {
            e.printStackTrace();
        }

    }

    private Instances getInputInstance(File fileName) {
        CSVLoader loader = new CSVLoader();
        Instances instance = null;
        try {
            loader.setSource(fileName);
            instance = loader.getDataSet();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return instance;
    }

    private void writeMergedOutput() {

        LOGGER.info("Started merging results");
        Instances finalResult = result.poll();

        while (!result.isEmpty()) {
            finalResult = Instances.mergeInstances(finalResult, result.poll());
        }

        try {
            BufferedWriter writer = new BufferedWriter(new FileWriter(path + "Output" + ".arff"));
            writer.write(finalResult.toString());
            writer.flush();
            writer.close();

        } catch (Exception e) {
            e.printStackTrace();
        }
        LOGGER.info("Finished merging results");
    }

    public static void main(String[] args) {
        long start = System.currentTimeMillis();
        try {
            LOGGER.info("Loading data");
            AttributeSelector attributeSelector = new AttributeSelector();
            attributeSelector.run(path + "Parts");

        } catch (Exception e) {
            e.printStackTrace();
        }
        long end = System.currentTimeMillis();
        LOGGER.info("Execution time: " + (end - start));
    }

    public void run(String sourceDir) {
        String master = "local[*]";

        SparkConf conf = new SparkConf()
                .setAppName(AttributeSelector.class.getName())
                .setMaster(master);

        JavaSparkContext context = new JavaSparkContext(conf);

        JavaFutureAction<Void> task = context.wholeTextFiles(sourceDir)
            .foreachAsync(new VoidFunction<Tuple2<String,String>>(){

                @Override
                public void call(Tuple2<String, String> fileInfo) throws Exception {
                    File file = new File(fileInfo._1);
                    Instances instance = getInputInstance(file);
                    instance.setClassIndex(instance.numAttributes() - 1);
                    int[] indices = selectAttributes(instance);
                    selectData(instance, indices);
                    LOGGER.info("Finished executing: " + fileInfo._1);
                }

        });

        while(!task.isDone()){

        }
        writeMergedOutput();

        context.close();
    }

}
What is causing this exception, and how can I resolve it?

The exception I get is:

org.apache.spark.SparkException: Task not serializable
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
    at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
    at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
    at org.apache.spark.SparkContext.clean(SparkContext.scala:2094)
    at org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1.apply(AsyncRDDActions.scala:126)
    at org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1.apply(AsyncRDDActions.scala:125)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
    at org.apache.spark.rdd.AsyncRDDActions.foreachAsync(AsyncRDDActions.scala:125)
    at org.apache.spark.api.java.JavaRDDLike$class.foreachAsync(JavaRDDLike.scala:732)
    at org.apache.spark.api.java.AbstractJavaRDDLike.foreachAsync(JavaRDDLike.scala:45)
    at geneselection.AttributeSelector.run(AttributeSelector.java:129)
    at geneselection.AttributeSelector.main(AttributeSelector.java:110)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:738)
    at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:187)
    at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:212)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:126)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.NotSerializableException: geneselection.AttributeSelector
Serialization stack:
    - object not serializable (class: geneselection.AttributeSelector, value: geneselection.AttributeSelector@5d43409a)
    - field (class: geneselection.AttributeSelector$1, name: this$0, type: class geneselection.AttributeSelector)
    - object (class geneselection.AttributeSelector$1, geneselection.AttributeSelector$1@210308d5)
    - field (class: org.apache.spark.api.java.JavaRDDLike$$anonfun$foreachAsync$1, name: f$15, type: interface org.apache.spark.api.java.function.VoidFunction)
    - object (class org.apache.spark.api.java.JavaRDDLike$$anonfun$foreachAsync$1, <function1>)
    at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
    at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
    at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:295)
    ... 22 more
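The serialization stack above already names the culprit: the anonymous VoidFunction (compiled as AttributeSelector$1) keeps an implicit this$0 reference to the enclosing AttributeSelector instance, and AttributeSelector does not implement java.io.Serializable, so Spark cannot ship the closure to the executors. The quickest change that makes the posted code pass the serializability check is declaring the class as public class AttributeSelector implements java.io.Serializable (all of its fields are static, so nothing extra is actually serialized). A cleaner restructuring is to avoid capturing the outer instance at all. Below is a minimal, untested sketch of that approach; it reuses the names from the question and assumes getInputInstance, selectAttributes and selectData can be declared static (in the posted code they use no instance state), with the unchanged members only indicated by a comment.

// Sketch of one possible fix, reusing the question's class and method names.
// Assumption: getInputInstance, selectAttributes and selectData are changed
// to static methods, so the function below never needs a reference to an
// AttributeSelector object.
import java.io.File;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

import weka.core.Instances;

public class AttributeSelector {

    // A static nested class, unlike the anonymous inner class in the question,
    // carries no implicit this$0 field pointing at AttributeSelector, so Spark
    // only has to serialize this small, stateless object. VoidFunction itself
    // already extends java.io.Serializable.
    private static class SelectAttributesForFile
            implements VoidFunction<Tuple2<String, String>> {

        @Override
        public void call(Tuple2<String, String> fileInfo) throws Exception {
            Instances instance = getInputInstance(new File(fileInfo._1)); // now static
            instance.setClassIndex(instance.numAttributes() - 1);
            int[] indices = selectAttributes(instance);                   // now static
            selectData(instance, indices);                                // now static
        }
    }

    public void run(String sourceDir) {
        SparkConf conf = new SparkConf()
                .setAppName(AttributeSelector.class.getName())
                .setMaster("local[*]");
        JavaSparkContext context = new JavaSparkContext(conf);

        // foreach blocks until the job finishes, which also removes the need
        // for the foreachAsync call plus busy-wait loop from the question.
        context.wholeTextFiles(sourceDir)
               .foreach(new SelectAttributesForFile());

        writeMergedOutput();
        context.close();
    }

    // The remaining members from the question (path, result, LOGGER,
    // getInputInstance, selectAttributes, selectData, writeMergedOutput, main)
    // stay as they are, apart from the static modifiers assumed above.
}

One further caveat: even with serialization fixed, adding to the static result queue from inside the closure only behaves as intended because the job runs with master local[*], where driver and executors share one JVM. On a real cluster each executor would fill its own copy of the queue, and the driver's writeMergedOutput would see an empty one; collecting the per-file results back to the driver (for example via a transformation followed by collect) would be the more portable design.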