
Scala: Task not serializable with aggregateByKey


Environment: Spark 1.6.0, using Scala. I can compile the program with sbt, but when I submit it, it hits an error. The full error is as follows:

238 17/01/21 18:32:24 INFO net.NetworkTopology: Adding a new node: /YH11070029/10.39.0.213:50010
17/01/21 18:32:24 INFO storage.BlockManagerMasterEndpoint: Registering block  manager 10.39.0.44:41961 with 2.7 GB RAM, BlockManagerId(349, 10.39.0.44, 41961)
17/01/21 18:32:24 INFO storage.BlockManagerMasterEndpoint: Registering block manager 10.39.2.178:48591 with 2.7 GB RAM, BlockManagerId(518, 10.39.2.178,  48591)
Exception in thread "main" org.apache.spark.SparkException: Task not serializable
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
    at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
    at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
    at org.apache.spark.SparkContext.clean(SparkContext.scala:2055)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$combineByKeyWithClassTag$1.apply(PairRDDFunctions.scala:93)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$combineByKeyWithClassTag$1.apply(PairRDDFunctions.scala:82)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
    at org.apache.spark.rdd.PairRDDFunctions.combineByKeyWithClassTag(PairRDDFunctions.scala:82)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$aggregateByKey$1.apply(PairRDDFunctions.scala:177)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$aggregateByKey$1.apply(PairRDDFunctions.scala:166)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
    at org.apache.spark.rdd.PairRDDFunctions.aggregateByKey(PairRDDFunctions.scala:166)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$aggregateByKey$3.apply(PairRDDFunctions.scala:206)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$aggregateByKey$3.apply(PairRDDFunctions.scala:206)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
    at org.apache.spark.rdd.PairRDDFunctions.aggregateByKey(PairRDDFunctions.scala:205)
    at com.sina.adalgo.feature.ETL$$anonfun$13.apply(ETL.scala:190)
    at com.sina.adalgo.feature.ETL$$anonfun$13.apply(ETL.scala:102)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
The purpose of the code is to count how often each categorical feature value occurs. The main code is as follows:

// Presumably imported earlier in the file:
// import scala.collection.mutable.HashMap
// import scala.collection.parallel.mutable.ParArray

object ETL extends Serializable {
           ... ...

// Pair every categorical feature value with its column index, giving an
// RDD of (featureIndex, featureValue) pairs.
val cateList = featureData.map {
    case (psid: String, label: String, cate_features: ParArray[String], media_features: String) =>
        cate_features.zipWithIndex.map(x => (x._2, x._1))
}.flatMap(_.toList)

// Sequence operator: add one occurrence of the value s to the partial map m.
def seqop(m: HashMap[String, Int], s: String): HashMap[String, Int] = {
    var x = m.getOrElse(s, 0)
    x += 1
    m += s -> x
    m
}

// Combine operator: merge the counts of map n into map m.
def combop(m: HashMap[String, Int], n: HashMap[String, Int]): HashMap[String, Int] = {
    for (k <- n) {
        var x = m.getOrElse(k._1, 0)
        x += k._2
        m += k._1 -> x
    }
    m
}

val hash = HashMap[String, Int]()
// (i, HashMap[String, Int]) where i corresponds to a categorical feature index
val feaFreq = cateList.aggregateByKey(hash)(seqop, combop)
The ETL object already extends Serializable. Why does this error happen? Can you help me?

For me, this problem typically occurs in Spark when we use a closure as the aggregation function that unintentionally closes over some unneeded objects and/or, sometimes, simply over a function that belongs to the main class of the Spark driver code.

I suspect that may be the case here, since the stack trace involves org.apache.spark.util.ClosureCleaner as the top culprit.

This is problematic because, in that case, when Spark tries to ship the function to the workers so they can perform the actual aggregation, it ends up serializing far more than you actually intended: the function itself plus its surrounding class.
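To make the capture concrete, here is a minimal, hypothetical sketch (the class and names are illustrative, not taken from the question): when the function passed to an RDD operation is an instance method of the driver class, the resulting closure holds a reference to this, so Spark has to serialize the whole enclosing instance.

import org.apache.spark.{SparkConf, SparkContext}

// Hypothetical driver class, only for illustration.
class Driver {
    val conf = new SparkConf().setAppName("capture-demo").setMaster("local[2]")
    val sc = new SparkContext(conf)               // SparkContext is not serializable
    val words = sc.parallelize(Seq("a", "b", "a"))

    // An ordinary instance method of the driver class.
    def double(s: String): String = s + s

    def run(): Unit = {
        // Passing `double` eta-expands it into a function value that calls
        // this.double(...), so Spark must serialize the whole Driver instance,
        // including sc, and fails with "Task not serializable".
        words.map(double).collect()
    }
}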

See also below, where some borderline cases of closure serialization are explained in detail.


A quick fix could be to move the definition of the functions you use in aggregateByKey into a separate object, completely independent from the rest of the code.
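As a minimal sketch of that quick fix (the object name Aggregators is mine, not from the question), the two aggregation functions can live in a small standalone object that contains nothing else:

import scala.collection.mutable.HashMap

// A self-contained object: it holds no RDDs, no SparkContext and no other
// driver state, so referencing it from a closure drags nothing extra along.
object Aggregators extends Serializable {

    // Add one occurrence of the value s to the partial map m.
    def seqop(m: HashMap[String, Int], s: String): HashMap[String, Int] = {
        m += s -> (m.getOrElse(s, 0) + 1)
        m
    }

    // Merge the counts of map n into map m.
    def combop(m: HashMap[String, Int], n: HashMap[String, Int]): HashMap[String, Int] = {
        for ((k, v) <- n) {
            m += k -> (m.getOrElse(k, 0) + v)
        }
        m
    }
}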

Could you add the code? From the exception alone there is no way to see the cause of the problem.

"Task not serializable": check whether your own code contains objects that are not serializable.

The code is shown now. I have checked that the object is serializable, thanks. I tried moving the definitions of the functions used in aggregateByKey into a separate object, but it did not work. My code is posted; I suspect that hash (HashMap[String, Int]) may be the problem.

Thanks for your feedback and the code. Try moving seqop and combop into a separate object that contains nothing else, outside of ETL. I think what is happening here is that the whole ETL object is being pulled into serialization because it contains cateList and feaFreq, which are RDDs, and that serialization fails.
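If that diagnosis is right, the call site inside ETL would then only refer to the standalone object, roughly like this (again a sketch, reusing the hypothetical Aggregators object from the answer above):

// Inside ETL: the functions passed to aggregateByKey now live in Aggregators,
// so the closure no longer references the ETL object and its RDD fields.
val feaFreq = cateList.aggregateByKey(HashMap[String, Int]())(
    Aggregators.seqop,
    Aggregators.combop
)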