How do I fix java.io.NotSerializableException when processing a DataFrame?
Tags: java, scala, apache-spark
import java.net.URLEncoder
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.{col, udf}

def URLEnc(input: String): String = {
  URLEncoder.encode(input, "UTF-8")
}

val URLEncUDF: UserDefinedFunction = udf(URLEnc(_: String))

val file = spark.read.format("xml")
  .option("rootTag", "channel").option("rowTag", "item")
  .load("path")

where file is a DataFrame read from an XML source (spark-xml):

val file1 = file.withColumn("description", URLEncUDF(col("g:description")))

The log looks like this:
Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:416)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:406)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:162)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2362)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$1(RDD.scala:886)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
at org.apache.spark.rdd.RDD.mapPartitionsWithIndex(RDD.scala:885)
at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:723)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:316)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:434)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:420)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3627)
at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2697)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3618)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3616)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2697)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2904)
at org.apache.spark.sql.Dataset.getRows(Dataset.scala:300)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:337)
at org.apache.spark.sql.Dataset.show(Dataset.scala:826)
at org.apache.spark.sql.Dataset.show(Dataset.scala:803)
at AIFeed.<init>(AIFeed.scala:16)
at AIFeed$.delayedEndpoint$AIFeed$1(AIFeed.scala:113)
at AIFeed$delayedInit$body.apply(AIFeed.scala:112)
at scala.Function0.apply$mcV$sp(Function0.scala:39)
at scala.Function0.apply$mcV$sp$(Function0.scala:39)
at scala.runtime.AbstractFunction0.apply$mcV$sp(AbstractFunction0.scala:17)
at scala.App.$anonfun$main$1$adapted(App.scala:80)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.App.main(App.scala:80)
at scala.App.main$(App.scala:78)
at AIFeed$.main(AIFeed.scala:112)
at AIFeed.main(AIFeed.scala)
Caused by: java.io.NotSerializableException: AIFeed
Serialization stack:
- object not serializable (class: AIFeed, value: AIFeed@5bccef9f)
- element of array (index: 0)
- array (class [Ljava.lang.Object;, size 1)
- field (class: java.lang.invoke.SerializedLambda, name: capturedArgs, type: class [Ljava.lang.Object;)
- object (class java.lang.invoke.SerializedLambda, SerializedLambda[capturingClass=class FeedFunction, functionalInterfaceMethod=scala/Function1.apply:(Ljava/lang/Object;)Ljava/lang/Object;, implementation=invokeStatic FeedFunction.$anonfun$URLEncUDF$1:(LFeedFunction;Ljava/lang/String;)Ljava/lang/String;, instantiatedMethodType=(Ljava/lang/String;)Ljava/lang/String;, numCaptured=1])
- writeReplace data (class: java.lang.invoke.SerializedLambda)
- object (class FeedFunction$$Lambda$275/1443173326, FeedFunction$$Lambda$275/1443173326@51e94b7d)
- element of array (index: 5)
- array (class [Ljava.lang.Object;, size 6)
- element of array (index: 1)
- array (class [Ljava.lang.Object;, size 3)
- field (class: java.lang.invoke.SerializedLambda, name: capturedArgs, type: class [Ljava.lang.Object;)
- object (class java.lang.invoke.SerializedLambda, SerializedLambda[capturingClass=class org.apache.spark.sql.execution.WholeStageCodegenExec, functionalInterfaceMethod=scala/Function2.apply:(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;, implementation=invokeStatic org/apache/spark/sql/execution/WholeStageCodegenExec.$anonfun$doExecute$4$adapted:(Lorg/apache/spark/sql/catalyst/expressions/codegen/CodeAndComment;[Ljava/lang/Object;Lorg/apache/spark/sql/execution/metric/SQLMetric;Ljava/lang/Object;Lscala/collection/Iterator;)Lscala/collection/Iterator;, instantiatedMethodType=(Ljava/lang/Object;Lscala/collection/Iterator;)Lscala/collection/Iterator;, numCaptured=3])
- writeReplace data (class: java.lang.invoke.SerializedLambda)
- object (class org.apache.spark.sql.execution.WholeStageCodegenExec$$Lambda$2116/996471089, org.apache.spark.sql.execution.WholeStageCodegenExec$$Lambda$2116/996471089@565a6af)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:41)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:413)
... 45 more
20/12/16 17:55:15 INFO BlockManagerInfo: Removed broadcast_1_piece0 on 192.168.1.4:34511 in memory (size: 2.9 KiB, free: 1407.3 MiB)
20/12/16 17:55:15 INFO SparkContext: Invoking stop() from shutdown hook
20/12/16 17:55:15 INFO BlockManagerInfo: Removed broadcast_0_piece0 on 192.168.1.4:34511 in memory (size: 23.7 KiB, free: 1407.3 MiB)
20/12/16 17:55:15 INFO SparkUI: Stopped Spark web UI at http://192.168.1.4:4040
20/12/16 17:55:15 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
20/12/16 17:55:15 INFO MemoryStore: MemoryStore cleared
20/12/16 17:55:15 INFO BlockManager: BlockManager stopped
20/12/16 17:55:15 INFO BlockManagerMaster: BlockManagerMaster stopped
20/12/16 17:55:15 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
20/12/16 17:55:15 INFO SparkContext: Successfully stopped SparkContext
20/12/16 17:55:15 INFO ShutdownHookManager: Shutdown hook called
The difference between these two declarations is the cause of the error. A def is a method of the enclosing class, so eta-expanding it into the udf (URLEnc(_: String)) creates a lambda that captures the enclosing instance, and the serialization stack above shows exactly that: the captured AIFeed instance is not serializable. A val, by contrast, holds a standalone function value:

def someMeth(a: Int): Int = a + 1    // method: compiled as a member of the enclosing class
val someFunc = (a: Int) => a + 1     // function value: an independent Function1 object, capturing nothing here

So either build the UDF from a function value, move the helper into a serializable object, or make the enclosing class itself Serializable.
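A minimal sketch of how that could apply to the code in the question, assuming Spark 3.x as in the stack trace; the names UrlCodec, urlEnc and urlEncUDF are made up for illustration, and the column g:description is taken from the question:

import java.net.URLEncoder
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.{col, udf}

// Option 1: a function value. The lambda references nothing from the
// enclosing class, so the closure is serializable on its own.
val urlEnc: String => String = (input: String) => URLEncoder.encode(input, "UTF-8")
val urlEncUDF: UserDefinedFunction = udf(urlEnc)

// Option 2: keep the def, but place it in a standalone serializable object,
// so the instance captured by the eta-expanded lambda can be serialized.
object UrlCodec extends Serializable {
  def encode(input: String): String = URLEncoder.encode(input, "UTF-8")
  val encodeUDF: UserDefinedFunction = udf(encode _)
}

// Usage with the DataFrame from the question (either variant):
// val file1 = file.withColumn("description", urlEncUDF(col("g:description")))

Note that neither variant handles nulls: URLEncoder.encode throws a NullPointerException on a null input, so a null guard may still be needed if g:description can be missing.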