任务不可序列化错误-Spark Java
我的Spark应用程序中有以下代码。它应该从csv文件中过滤出基因。我正在将csv文件加载到spark RDD中。当我使用spark submit运行jar时,会出现Task not serializable异常任务不可序列化错误-Spark Java,java,csv,apache-spark,serialization,Java,Csv,Apache Spark,Serialization,我的Spark应用程序中有以下代码。它应该从csv文件中过滤出基因。我正在将csv文件加载到spark RDD中。当我使用spark submit运行jar时,会出现Task not serializable异常 public class AttributeSelector { public static final String path = System.getProperty("user.dir") + File.separator; public static Queu
public class AttributeSelector {
public static final String path = System.getProperty("user.dir") + File.separator;
public static Queue<Instances> result = new LinkedBlockingQueue<>();
private static final Logger LOGGER = LoggerFactory.getLogger(AttributeSelector.class);
int[] selectAttributes(Instances data) {
int[] indexes = null;
AttributeSelection filter = new AttributeSelection();
CfsSubsetEval evaluator = new CfsSubsetEval();
filter.setEvaluator(evaluator);
BestFirst search = new BestFirst();
filter.setSearch(search);
try {
filter.SelectAttributes(data);
indexes = filter.selectedAttributes();
} catch (Exception e) {
System.out.println("Error when resampling input data with selected attributes!");
e.printStackTrace();
}
return indexes;
}
public void selectData(Instances data, int[] indexes) {
Instances newData = data;
Remove remove = new Remove();
remove.setAttributeIndicesArray(indexes);
remove.setInvertSelection(true);
try {
remove.setInputFormat(data);
newData = Filter.useFilter(data, remove);
result.add(newData);
} catch (Exception e) {
e.printStackTrace();
}
}
private Instances getInputInstance(File fileName) {
CSVLoader loader = new CSVLoader();
Instances instance = null;
try {
loader.setSource(fileName);
instance = loader.getDataSet();
} catch (IOException e) {
e.printStackTrace();
}
return instance;
}
private void writeMergedOutput() {
LOGGER.info("Started merging results");
Instances finalResult = result.poll();
while (!result.isEmpty()) {
finalResult = Instances.mergeInstances(finalResult, result.poll());
}
try {
BufferedWriter writer = new BufferedWriter(new FileWriter(path + "Output" + ".arff"));
writer.write(finalResult.toString());
writer.flush();
writer.close();
} catch (Exception e) {
e.printStackTrace();
}
LOGGER.info("Finished merging results");
}
public static void main(String[] args) {
long start = System.currentTimeMillis();
try {
LOGGER.info("Loading data");
AttributeSelector attributeSelector = new AttributeSelector();
attributeSelector.run(path + "Parts");
} catch (Exception e) {
e.printStackTrace();
}
long end = System.currentTimeMillis();
LOGGER.info("Execution time: " + (end - start));
}
public void run(String sourceDir) {
String master = "local[*]";
SparkConf conf = new SparkConf()
.setAppName(AttributeSelector.class.getName())
.setMaster(master);
JavaSparkContext context = new JavaSparkContext(conf);
JavaFutureAction<Void> task = context.wholeTextFiles(sourceDir)
.foreachAsync(new VoidFunction<Tuple2<String,String>>(){
@Override
public void call(Tuple2<String, String> fileInfo) throws Exception {
File file = new File(fileInfo._1);
Instances instance = getInputInstance(file);
instance.setClassIndex(instance.numAttributes() - 1);
int[] indices = selectAttributes(instance);
selectData(instance, indices);
LOGGER.info("Finished executing: " + fileInfo._1);
}
});
while(!task.isDone()){
}
writeMergedOutput();
context.close();
}
}
公共类属性选择器{
公共静态最终字符串路径=System.getProperty(“user.dir”)+File.separator;
公共静态队列结果=新建LinkedBlockingQueue();
私有静态最终记录器Logger=LoggerFactory.getLogger(AttributeSelector.class);
int[]选择属性(实例数据){
int[]索引=null;
AttributeSelection筛选器=新建AttributeSelection();
CfsSubsetEval计算器=新的CfsSubsetEval();
filter.setEvaluator(evaluator);
BestFirst搜索=新的BestFirst();
filter.setSearch(搜索);
试一试{
过滤。选择属性(数据);
index=filter.selectedAttribute();
}捕获(例外e){
System.out.println(“使用选定属性重新采样输入数据时出错!”);
e、 printStackTrace();
}
收益指标;
}
公共void selectData(实例数据,int[]索引){
实例newData=data;
移除=新移除();
remove.setAttributeIndicesArray(索引);
remove.setInvertSelection(true);
试一试{
删除.setInputFormat(数据);
newData=Filter.useFilter(数据,删除);
结果。添加(新数据);
}捕获(例外e){
e、 printStackTrace();
}
}
私有实例getInputInstance(文件名){
CSVLoader loader=新CSVLoader();
实例实例=null;
试一试{
setSource(文件名);
instance=loader.getDataSet();
}捕获(IOE异常){
e、 printStackTrace();
}
返回实例;
}
私有void writeMergedOutput(){
LOGGER.info(“开始合并结果”);
实例finalResult=result.poll();
而(!result.isEmpty()){
finalResult=Instances.mergeInstances(finalResult,result.poll());
}
试一试{
BufferedWriter writer=新的BufferedWriter(新文件编写器(路径+“输出”+“.arff”);
writer.write(finalResult.toString());
writer.flush();
writer.close();
}捕获(例外e){
e、 printStackTrace();
}
LOGGER.info(“完成合并结果”);
}
公共静态void main(字符串[]args){
长启动=System.currentTimeMillis();
试一试{
LOGGER.info(“加载数据”);
AttributeSelector AttributeSelector=新的AttributeSelector();
attributeSelector.run(路径+“部件”);
}捕获(例外e){
e、 printStackTrace();
}
long end=System.currentTimeMillis();
info(“执行时间:”+(结束-开始));
}
公共void运行(字符串sourceDir){
字符串master=“local[*]”;
SparkConf conf=新的SparkConf()
.setAppName(AttributeSelector.class.getName())
.setMaster(master);
JavaSparkContext上下文=新的JavaSparkContext(conf);
JavaFutureAction任务=context.wholeTextFiles(sourceDir)
.foreachAsync(新的VoidFunction(){
@凌驾
公共void调用(Tuple2 fileInfo)引发异常{
File File=新文件(fileInfo.\u 1);
实例实例=getInputInstance(文件);
setClassIndex(instance.numAttributes()-1);
int[]index=selectAttributes(实例);
选择数据(实例、索引);
LOGGER.info(“已完成执行:+fileInfo._1”);
}
});
而(!task.isDone()){
}
WriteEmergedOutput();
context.close();
}
}
导致此异常的原因是什么?如何解决它
我得到的例外是
org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2094)
at org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1.apply(AsyncRDDActions.scala:126)
at org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1.apply(AsyncRDDActions.scala:125)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.AsyncRDDActions.foreachAsync(AsyncRDDActions.scala:125)
at org.apache.spark.api.java.JavaRDDLike$class.foreachAsync(JavaRDDLike.scala:732)
at org.apache.spark.api.java.AbstractJavaRDDLike.foreachAsync(JavaRDDLike.scala:45)
at geneselection.AttributeSelector.run(AttributeSelector.java:129)
at geneselection.AttributeSelector.main(AttributeSelector.java:110)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:738)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:187)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:212)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:126)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.NotSerializableException: geneselection.AttributeSelector
Serialization stack:
- object not serializable (class: geneselection.AttributeSelector, value: geneselection.AttributeSelector@5d43409a)
- field (class: geneselection.AttributeSelector$1, name: this$0, type: class geneselection.AttributeSelector)
- object (class geneselection.AttributeSelector$1, geneselection.AttributeSelector$1@210308d5)
- field (class: org.apache.spark.api.java.JavaRDDLike$$anonfun$foreachAsync$1, name: f$15, type: interface org.apache.spark.api.java.function.VoidFunction)
- object (class org.apache.spark.api.java.JavaRDDLike$$anonfun$foreachAsync$1, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:295)
... 22 more
org.apache.spark.SparkException:任务不可序列化
位于org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
位于org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
位于org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
位于org.apache.spark.SparkContext.clean(SparkContext.scala:2094)
在org.apache.spark.rdd.asynchddactions$$anonfun$foreachAsync$1.apply上(asynchrddactions.scala:126)
在org.apache.spark.rdd.asynchddactions$$anonfun$foreachAsync$1.apply上(asynchrddactions.scala:125)
位于org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
位于org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
位于org.apache.spark.rdd.rdd.withScope(rdd.scala:362)
位于org.apache.spark.rdd.asynchrddactions.foreachAsync(asynchrddactions.scala:125)
位于org.apache.spark.api.java.JavaRDDLike$class.foreachAsync(JavaRDDLike.scala:732)
位于org.apache.spark.api.java.AbstractJavaRDDLike.foreachAsync(JavaRDDLike.scala:45)
运行(AttributeSelector.java:129)
位于geneselection.AttributeSelector.main(AttributeSelector.java:110)
在sun.reflect.NativeMethodAccessorImpl.invoke0(本机方法)处
位于sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
在sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)中
位于java.lang.reflect.Method.invoke(Method.java:498)
位于org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:738)
位于org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:187)
位于org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:212)
位于org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.s