Spark: strange "Task not serializable" error


This does not work and complains about a serialization problem:

import java.nio.charset.StandardCharsets
import scala.util.Try
import java.net.URLDecoder

import spark.implicits._
import org.apache.spark.sql.functions.{col, udf}  // usually auto-imported in spark-shell; added so the snippet is self-contained

val df = List("http%3A%2F%2Fforum.krasmama.ru%2Fviewforum.php%3Ff%3D247").toDF("URL")

// URL-decode the string; return null if decoding fails
def parseURL(s: String): String = {
    Try(URLDecoder.decode(s, StandardCharsets.UTF_8.name())).toOption.getOrElse(null)
}

val parseURLudf = udf[String, String](parseURL)

val myCond = col("URL").startsWith("http")
val df2 = df.filter(myCond)
val dfWithParsedUrl = df2.withColumn("URL", parseURLudf(col("URL")))
dfWithParsedUrl.show(5, truncate=30)
But if I get rid of the myCond variable and paste col("URL").startsWith("http") directly into the filter call, it works. Why?
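
For reference, here is a minimal sketch of the inlined variant that does work (the name dfInlined is just for illustration); the error log below comes from the original version that goes through the myCond val:

// Same df and parseURLudf as above; the only change is that the startsWith
// condition is passed to filter directly instead of being stored in a val.
val dfInlined = df
  .filter(col("URL").startsWith("http"))
  .withColumn("URL", parseURLudf(col("URL")))
dfInlined.show(5, truncate = 30)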

Here is the error log:

org.apache.spark.SparkException: Task not serializable
  at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:340)
  at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:330)
  at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:156)
  at org.apache.spark.SparkContext.clean(SparkContext.scala:2294)
  at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1.apply(RDD.scala:841)
  at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1.apply(RDD.scala:840)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
  at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
  at org.apache.spark.rdd.RDD.mapPartitionsWithIndex(RDD.scala:840)
  at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:389)
  at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
  at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
  at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
  at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
  at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
  at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:228)
  at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:311)
  at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
  at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:2861)
  at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2150)
  at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2150)
  at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2842)
  at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
  at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2841)
  at org.apache.spark.sql.Dataset.head(Dataset.scala:2150)
  at org.apache.spark.sql.Dataset.take(Dataset.scala:2363)
  at org.apache.spark.sql.Dataset.showString(Dataset.scala:241)
  at org.apache.spark.sql.Dataset.show(Dataset.scala:661)
  ... 53 elided
Caused by: java.io.NotSerializableException: org.apache.spark.sql.Column
Serialization stack:
    - object not serializable (class: org.apache.spark.sql.Column, value: startswith(URL, http))
    - field (class: $iw, name: myCond, type: class org.apache.spark.sql.Column)
    - object (class $iw, $iw@720343cf)
    - field (class: $anonfun$1, name: $outer, type: class $iw)
    - object (class $anonfun$1, <function1>)
    - field (class: org.apache.spark.sql.catalyst.expressions.ScalaUDF$$anonfun$2, name: func$2, type: interface scala.Function1)
    - object (class org.apache.spark.sql.catalyst.expressions.ScalaUDF$$anonfun$2, <function1>)
    - field (class: org.apache.spark.sql.catalyst.expressions.ScalaUDF, name: f, type: interface scala.Function1)
    - object (class org.apache.spark.sql.catalyst.expressions.ScalaUDF, UDF(input[0, string, false]))
    - element of array (index: 2)
    - array (class [Ljava.lang.Object;, size 3)
    - field (class: org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8, name: references$1, type: class [Ljava.lang.Object;)
    - object (class org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8, <function2>)
  at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
  at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
  at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
  at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:337)
  ... 82 more

By the way, I do not know whether this can even be reproduced locally, since in that case Spark may not serialize anything? (If it only serializes when it needs to ship code from the driver to the executors; sorry, I do not know the details of how this works.)

What is the serialization exception message you get back? At first glance I would say the problem is that val myCond = col("URL").startsWith("http") is declared as a val. You are assuming you get a Boolean back, but that is not the case? If you print the myCond variable, what are its contents and its type?

The type of myCond is Column. I will add the error message to the question.
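
For completeness, the serialization stack above shows the UDF's closure pulling in the REPL line object ($iw) that also holds myCond, which is an org.apache.spark.sql.Column and not serializable. A minimal sketch of one common workaround, assuming that capture is indeed the cause (this is not from the original thread, just an illustration):

import org.apache.spark.sql.functions.col

// Sketch: mark the Column-typed val as @transient so Java serialization skips
// it when the REPL line object is shipped along with the UDF's closure. The
// filter is planned on the driver, so the executors never need this field.
@transient val myCond = col("URL").startsWith("http")

val df2 = df.filter(myCond)
val dfWithParsedUrl = df2.withColumn("URL", parseURLudf(col("URL")))
dfWithParsedUrl.show(5, truncate = 30)

// An alternative with the same intent: define the UDF as a self-contained
// lambda (or inside a serializable object) so its closure never references
// the REPL wrapper that holds myCond, e.g.
// val parseURLudf = udf((s: String) =>
//   Try(URLDecoder.decode(s, StandardCharsets.UTF_8.name())).toOption.orNull)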