Java Apache Spark - 简单的单词计数:SparkException: Task not serializable(任务不可序列化)
标签:java、maven、apache-spark
我试图在Apache Spark(v1.3.0)上进行一些测试,我有一个简单的Java 8类:
/**
 * Word-count example driven through a {@code JavaSparkContext}: reads a text file,
 * counts the tokens obtained by splitting each line on a single space, and prints
 * the ten tokens with the highest counts.
 */
public class WordCount {
private JavaSparkContext ctx;
private String inputFile, outputFile;
/**
 * Stores the input/output paths and creates the Spark context.
 *
 * @param inputFile  path of the text file to read
 * @param outputFile path for the results (only referenced by the commented-out save in doWordCount)
 */
public WordCount(String inputFile, String outputFile) {
this.inputFile = inputFile;
this.outputFile = outputFile;
// Initialize Spark Conf
// Master is "local" and the application name is "WordCount"; the Spark home and the
// jars to ship are read from the SPARK_HOME and JARS environment variables.
ctx = new JavaSparkContext("local", "WordCount",
System.getenv("SPARK_HOME"), System.getenv("JARS"));
}
/**
 * Entry point. The paths are hard-coded; command-line arguments are currently ignored
 * (see the commented-out args[0] / args[1]).
 */
public static void main(String... args) {
String inputFile = "/home/workspace/spark/src/main/resources/inferno.txt";//args[0];
String outputFile = "/home/workspace/spark/src/main/resources/dv";//args[1];
WordCount wc = new WordCount(inputFile, outputFile);
wc.doWordCount();
wc.close();
}
/**
 * Runs the word count, prints the ten most frequent tokens (token only, not the count)
 * and then prints the elapsed wall-clock time in milliseconds.
 */
public void doWordCount() {
long start = System.currentTimeMillis();
JavaRDD<String> inputRdd = ctx.textFile(inputFile);
// Emit one (token, 1) pair per space-separated token of each line, then sum the 1s per token.
JavaPairRDD<String, Integer> count = inputRdd.flatMapToPair((String s) -> {
List<Tuple2<String, Integer>> list = new ArrayList<>();
Arrays.asList(s.split(" ")).forEach(s1 -> list.add(new Tuple2<String, Integer>(s1, 1)));
return list;
}).reduceByKey((x, y) -> x + y);
// The comparator sorts by descending count, so the first 10 elements are the most frequent tokens.
// NOTE(review): this lambda targets java.util.Comparator, which does not extend Serializable,
// so the lambda is presumably not serializable either. Spark has to serialize the comparator
// to run the task, which is the likely origin of "SparkException: Task not serializable".
List<Tuple2<String, Integer>> list = count.takeOrdered(10,
(o1, o2) -> o2._2() - o1._2());
list.forEach(t2 -> System.out.println(t2._1()));
// count.saveAsTextFile(outputFile);
long end = System.currentTimeMillis();
System.out.println(String.format("Time in ms is: %d", end - start));
}
/** Stops the underlying Spark context. */
public void close() {
ctx.stop();
}
}
好的,原因是您在处理过程中使用的所有类(即存储在RDD中的对象,以及作为函数传递给Spark的类)都必须是可序列化的。这意味着它们需要实现
Serializable
接口,或者您必须提供另一种序列化方式(例如Kryo)。实际上,我不确定为什么您定义的lambda函数被认为是不可序列化的,但我认为这与Java的Comparator接口有关:java.util.Comparator本身并没有继承Serializable,因此以它为目标类型的lambda默认是不可序列化的。
不过,让它正常工作的办法是定义一个可序列化的比较器,例如:
/**
 * Serializable comparator that orders (word, count) pairs by descending count.
 *
 * <p>It implements {@link Serializable} in addition to {@link Comparator} so that an
 * instance can be handed to APIs that need to serialize the comparator, such as
 * Spark's {@code takeOrdered(num, comparator)}.
 */
public class WordCountComparator implements Comparator<Tuple2<String, Integer>>, Serializable {
    private static final long serialVersionUID = 1L;

    /**
     * Compares two pairs by their count (second element), largest count first.
     *
     * @param o1 the first (word, count) pair
     * @param o2 the second (word, count) pair
     * @return a negative value if {@code o1} has the larger count, a positive value if
     *         {@code o2} has the larger count, and zero if the counts are equal
     */
    @Override
    public int compare(Tuple2<String, Integer> o1, Tuple2<String, Integer> o2) {
        // Integer.compare instead of "o2._2() - o1._2()": the subtraction overflows
        // (and flips sign) when the two values are far apart, e.g. for negative counts.
        return Integer.compare(o2._2(), o1._2());
    }
}
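public class WordCountComparator implements Comparator<Tuple2<String, Integer>>, Serializable {
@Override
public int compare(Tuple2<String, Integer> o1, Tuple2<String, Integer> o2) {
// TODO Auto-generated method stub
return o2._2()-o1._2();
}
}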
然后将它的一个实例作为第二个参数传递给您的takeOrdered
函数。
如果删除作为第二个参数传递给takeOrdered的lambda函数,会发生什么?
哇,它可以工作了。你认为这是Spark的bug吗?也许我可以在他们的GitHub上提交一个issue。
不,我不这么认为,但我也不知道为什么它会这样……我想原因是您必须把类型参数传递给Comparator
接口,但我不确定。
试试这个。正如@mark91提到的,传递的函数必须是可序列化的。通过对lambda进行显式的类型转换,可以使其可序列化:
List<Tuple2<String, Integer>> list = count.takeOrdered(10, (Comparator<Tuple2<String, Integer>> & Serializable) (o1, o2) -> o2._2() - o1._2());
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build for the Spark word-count example; packaged as a plain jar. -->
<groupId>it.conker.spark</groupId>
<artifactId>learning-spark-by-example</artifactId>
<modelVersion>4.0.0</modelVersion>
<name>Learning Spark by example</name>
<packaging>jar</packaging>
<version>0.0.1</version>
<dependencies>
<!-- spark-core 1.3.0 built for Scala 2.10. Scope "provided": available at compile time,
     but expected to be supplied by the runtime environment rather than bundled in the jar. -->
<dependency> <!-- Spark dependency -->
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.10</artifactId>
<version>1.3.0</version>
<scope>provided</scope>
</dependency>
<!-- JUnit is only on the test classpath. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
</dependencies>
<properties>
<!-- Java language level used for both source and target below (1.8 is required for lambdas). -->
<java.version>1.8</java.version>
</properties>
<build>
<!-- The compiler settings are declared under pluginManagement, i.e. as defaults that apply
     wherever maven-compiler-plugin is used in this build. -->
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>${java.version}</source>
<target>${java.version}</target>
</configuration>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>
// Variant 1: explicit comparator lambda that orders the pairs by descending count.
List<Tuple2<String, Integer>> list = count.takeOrdered(10,
(o1, o2) -> o2._2() - o1._2());
// Variant 2: no comparator is passed, so the elements' natural ordering is used. The trace
// that follows shows this failing because scala.Tuple2 cannot be cast to java.lang.Comparable.
List<Tuple2<String, Integer>> list = count.takeOrdered(10);
java.lang.ClassCastException: scala.Tuple2 cannot be cast to java.lang.Comparable
at org.spark-project.guava.collect.NaturalOrdering.compare(NaturalOrdering.java:28)
at scala.math.LowPriorityOrderingImplicits$$anon$7.compare(Ordering.scala:153)
at org.apache.spark.util.collection.Utils$$anon$1.compare(Utils.scala:35)
at org.spark-project.guava.collect.Ordering.leastOf(Ordering.java:672)
at org.apache.spark.util.collection.Utils$.takeOrdered(Utils.scala:37)
at org.apache.spark.rdd.RDD$$anonfun$34.apply(RDD.scala:1234)
at org.apache.spark.rdd.RDD$$anonfun$34.apply(RDD.scala:1231)
at org.apache.spark.rdd.RDD$$anonfun$14.apply(RDD.scala:634)
at org.apache.spark.rdd.RDD$$anonfun$14.apply(RDD.scala:634)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
at org.apache.spark.scheduler.Task.run(Task.scala:64)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
15/03/27 17:58:55 WARN TaskSetManager: Lost task 0.0 in stage 1.0 (TID 1, localhost): java.lang.ClassCastException: scala.Tuple2 cannot be cast to java.lang.Comparable
at org.spark-project.guava.collect.NaturalOrdering.compare(NaturalOrdering.java:28)
at scala.math.LowPriorityOrderingImplicits$$anon$7.compare(Ordering.scala:153)
at org.apache.spark.util.collection.Utils$$anon$1.compare(Utils.scala:35)
at org.spark-project.guava.collect.Ordering.leastOf(Ordering.java:672)
at org.apache.spark.util.collection.Utils$.takeOrdered(Utils.scala:37)
at org.apache.spark.rdd.RDD$$anonfun$34.apply(RDD.scala:1234)
at org.apache.spark.rdd.RDD$$anonfun$34.apply(RDD.scala:1231)
at org.apache.spark.rdd.RDD$$anonfun$14.apply(RDD.scala:634)
at org.apache.spark.rdd.RDD$$anonfun$14.apply(RDD.scala:634)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
at org.apache.spark.scheduler.Task.run(Task.scala:64)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
15/03/27 17:58:55 ERROR TaskSetManager: Task 0 in stage 1.0 failed 1 times; aborting job
15/03/27 17:58:55 INFO TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool
15/03/27 17:58:55 INFO TaskSchedulerImpl: Cancelling stage 1
15/03/27 17:58:56 INFO DAGScheduler: Job 0 failed: takeOrdered at WordCount.java:66, took 14.117721 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1.0 (TID 1, localhost): java.lang.ClassCastException: scala.Tuple2 cannot be cast to java.lang.Comparable
at org.spark-project.guava.collect.NaturalOrdering.compare(NaturalOrdering.java:28)
at scala.math.LowPriorityOrderingImplicits$$anon$7.compare(Ordering.scala:153)
at org.apache.spark.util.collection.Utils$$anon$1.compare(Utils.scala:35)
at org.spark-project.guava.collect.Ordering.leastOf(Ordering.java:672)
at org.apache.spark.util.collection.Utils$.takeOrdered(Utils.scala:37)
at org.apache.spark.rdd.RDD$$anonfun$34.apply(RDD.scala:1234)
at org.apache.spark.rdd.RDD$$anonfun$34.apply(RDD.scala:1231)
at org.apache.spark.rdd.RDD$$anonfun$14.apply(RDD.scala:634)
at org.apache.spark.rdd.RDD$$anonfun$14.apply(RDD.scala:634)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
at org.apache.spark.scheduler.Task.run(Task.scala:64)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1203)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1191)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1191)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1393)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
/**
 * Comparator for (word, count) pairs that sorts the pair with the highest count first.
 *
 * <p>The class is {@link Serializable} as well as a {@link Comparator}, which lets an
 * instance be passed to APIs that must serialize the comparator (for example Spark's
 * {@code takeOrdered(num, comparator)}).
 */
public class WordCountComparator implements Comparator<Tuple2<String, Integer>>, Serializable {
    private static final long serialVersionUID = 1L;

    /**
     * Orders two pairs by their second element (the count) in descending order.
     *
     * @param o1 the first (word, count) pair
     * @param o2 the second (word, count) pair
     * @return negative if {@code o1}'s count is larger, positive if {@code o2}'s count
     *         is larger, zero when both counts are equal
     */
    @Override
    public int compare(Tuple2<String, Integer> o1, Tuple2<String, Integer> o2) {
        // Use Integer.compare rather than subtracting the counts: a raw difference can
        // overflow int and yield the wrong sign for values that are far apart.
        return Integer.compare(o2._2(), o1._2());
    }
}