来自RDD映射的Spark Scala序列化错误
我有一个RDD格式的RDD[((Long,Long),(Long,Long))],我需要转换成RDD[((Long,Long),(Long,Long,Long,Long))],其中第二个RDD元组基于第一个RDD的函数 我正在尝试实现这个基于地图的功能,但是,我认为我在这里做错了什么。请帮我解决这个问题 以下是完整的代码:来自RDD映射的Spark Scala序列化错误,scala,apache-spark,serialization,Scala,Apache Spark,Serialization,我有一个RDD格式的RDD[((Long,Long),(Long,Long))],我需要转换成RDD[((Long,Long),(Long,Long,Long,Long))],其中第二个RDD元组基于第一个RDD的函数 我正在尝试实现这个基于地图的功能,但是,我认为我在这里做错了什么。请帮我解决这个问题 以下是完整的代码: package com.ranker.correlation.listitem import org.apache.spark.SparkConf import org.ap
package com.ranker.correlation.listitem
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import scala.collection.Map
// @transient: the RDD closure in run() calls the instance method up_down,
// which forces Spark to serialize `this`. SparkContext is not serializable,
// so it must be skipped during serialization; up_down reads no instance
// state, so executors never need `sc`.
class ListItemCorrelation(@transient private val sc: SparkContext) extends Serializable {

  /**
   * One-hot encoding of a pair of vote directions:
   * (both up, X up / Y down, X down / Y up, both down).
   *
   * Bug fix: the original used `dirX.equals(1)`, which compares a boxed
   * java.lang.Long against a java.lang.Integer and is therefore ALWAYS
   * false — every input fell through to (0, 0, 0, 1). Scala's `==`
   * performs numeric (cooperative) equality, so `dirX == 1` holds for 1L.
   */
  def up_down(dirX: Long, dirY: Long): (Long, Long, Long, Long) =
    (dirX == 1, dirY == 1) match {
      case (true, true)   => (1L, 0L, 0L, 0L)
      case (true, false)  => (0L, 1L, 0L, 0L)
      case (false, true)  => (0L, 0L, 1L, 0L)
      case (false, false) => (0L, 0L, 0L, 1L)
    }

  /**
   * Reads "user,item,direction" CSV lines from `votes` and, for each pair
   * of items voted on by the same user, emits
   * ((itemA, itemB), one-hot direction tuple) with itemA < itemB.
   */
  def run(votes: String): RDD[((Long, Long), (Long, Long, Long, Long))] = {
    val userVotes = sc.textFile(votes)
    // (user, (item, direction))
    val userVotesPairs = userVotes.map { t =>
      val p = t.split(",")
      (p(0).toLong, (p(1).toLong, p(2).toLong))
    }
    // Self-join per user; keep each unordered item pair exactly once
    // (lower item id first).
    val jn = userVotesPairs.join(userVotesPairs).values.filter(t => t._1._1 < t._2._1)
    val first = jn.map(t => ((t._1._1, t._2._1), (t._1._2, t._2._2)))
    val second = first.map(t => ((t._1._1, t._2._1), up_down(t._1._2, t._2._2)))
    //More functionality
    // NOTE(review): `result` is not defined in the visible code — presumably
    // derived from `second` in the elided "more functionality" section.
    return result
  }
}
object ListItemCorrelation extends Serializable {
  /** Driver entry point: args(0) = votes input path, args(1) = output path. */
  def main(args: Array[String]) {
    val votesPath = args(0)
    val sparkConf = new SparkConf().setAppName("SparkJoins").setMaster("local")
    val sparkContext = new SparkContext(sparkConf)
    val correlationJob = new ListItemCorrelation(sparkContext)
    val pairResults = correlationJob.run(votesPath)
    pairResults.saveAsTextFile(args(1))
    sparkContext.stop()
  }
}
package com.ranker.correlation.listitem
导入org.apache.spark.SparkConf
导入org.apache.spark.SparkContext
导入org.apache.spark.SparkContext_
导入org.apache.spark.rdd_
导入scala.collection.Map
类ListItemCorrelation(sc:SparkContext)扩展了可序列化{
def up_down(dirX:Long,dirY:Long):(Long,Long,Long){
if(dirX等于(1)){
如果(dirY等于(1)){
返回(1,0,0,0)
}否则{
返回(0,1,0,0)
}
}否则{
如果(dirY等于(1)){
返回(0,0,1,0)
}否则{
返回(0,0,0,1)
}
}
}
def run(投票:字符串):RDD[((长,长),(长,长,长,长))={
val uservoces=sc.textFile(投票)
val userVotesPairs=uservoals.map{t=>
val p=t.split(“,”)
(p(0).托龙,(p(1).托龙,p(2).托龙))
}
val jn=userVotesPairs.join(userVotesPairs.values.filter(t=>t.\u1.\u1,t.\u2.\u1),(t.\u1.\u2,t.\u2.))
var second=first.map(t=>((t.\u 1.\u 1,t.\u 2.\u 1),up.\u down(t.\u 1.\u 2,t.\u 2.\u 2)))
//更多功能
返回结果
}
}
对象ListItemCorrelation扩展了可序列化{
def main(参数:数组[字符串]){
val票数=args(0)
val conf=new SparkConf().setAppName(“SparkJoins”).setMaster(“本地”)
val context=新的SparkContext(conf)
val作业=新建ListItemCorrelation(上下文)
val结果=作业运行(投票)
val输出=args(1)
结果.saveAsTextFile(输出)
context.stop()
}
}
当我尝试运行此脚本时,出现以下错误:
线程 "main" 中抛出异常 org.apache.spark.SparkException: Task not
serializable(任务不可序列化),位于
org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
在
org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
在
org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
位于org.apache.spark.SparkContext.clean(SparkContext.scala:2094)
org.apache.spark.rdd.rdd$$anonfun$map$1.apply(rdd.scala:370)位于
org.apache.spark.rdd.rdd$$anonfun$map$1.apply(rdd.scala:369)位于
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
在
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
在org.apache.spark.rdd.rdd.withScope(rdd.scala:362)上
org.apache.spark.rdd.rdd.map(rdd.scala:369)位于
com.ranker.correlation.listitem.ListItemCorrelation.run(ListItemCorrelation.scala:34)
在
com.ranker.correlation.listitem.ListItemCorrelation$.main(ListItemCorrelation.scala:47)
在
com.ranker.correlation.listitem.ListItemCorrelation.main(ListItemCorrelation.scala)
原因:java.io.NotSerializableException:
org.apache.spark.SparkContext序列化堆栈:
-对象不可序列化(类:org.apache.spark.SparkContext,值:org.apache.spark.SparkContext@4248e66b)
-字段(类:com.ranker.correlation.listitem.ListItemCorrelation,名称:sc,类型:class org.apache.spark.SparkContext)
-对象(类com.ranker.correlation.listitem.ListItemCorrelation,com.ranker.correlation.listitem)。ListItemCorrelation@270b6b5e)
-字段(类:com.ranker.correlation.listitem.ListItemCorrelation$$anonfun$4,名称:
$outer,类型:class
com.ranker.correlation.listitem.ListItemCorrelation)
-对象(类com.ranker.correlation.listitem.ListItemCorrelation$$anonfun$4,
)在
org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
在
org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
在
org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
在
org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:295)
... 还有12个
执行以下行时发生此错误:
var second=first.map(t=>((t._1._1,t._2._1),up_down(t._1._2),
t、 _2._2)))
我对scala非常陌生,请帮助我找到正确的方法。将
up_down
方法放在伴生对象上。当在RDD闭包中访问任何类变量时,该类(以及其中的所有内容,如SparkContext)将被序列化。方法参数在这里算作类变量。使用静态对象可以解决以下问题:
package com.ranker.correlation.listitem
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import scala.collection.Map
object ListItemCorrelation {

  /**
   * One-hot encoding of a pair of vote directions:
   * (both up, X up / Y down, X down / Y up, both down).
   *
   * Lives on the (static) object rather than the class so that RDD closures
   * calling it do not capture — and try to serialize — the enclosing class
   * instance together with its SparkContext.
   *
   * Bug fix: `dirX.equals(1)` compares a boxed java.lang.Long against a
   * java.lang.Integer and is always false, so the original always returned
   * (0, 0, 0, 1). Scala's `==` uses numeric equality and behaves as intended.
   */
  def up_down(dirX: Long, dirY: Long): (Long, Long, Long, Long) =
    if (dirX == 1) {
      if (dirY == 1) (1L, 0L, 0L, 0L) else (0L, 1L, 0L, 0L)
    } else {
      if (dirY == 1) (0L, 0L, 1L, 0L) else (0L, 0L, 0L, 1L)
    }
}
class ListItemCorrelation(sc: SparkContext) extends Serializable {

  /**
   * Reads "user,item,direction" CSV lines from `votes` and, for each pair
   * of items voted on by the same user, pairs the item ids with the one-hot
   * direction tuple produced by the companion object's `up_down`.
   *
   * `up_down` is deliberately resolved on the ListItemCorrelation object:
   * calling an instance method here would capture `this` (and with it the
   * non-serializable SparkContext) inside the RDD closure.
   */
  def run(votes: String): RDD[((Long, Long), (Long, Long, Long, Long))] = {
    val userVotes = sc.textFile(votes)
    // (user, (item, direction))
    val userVotesPairs = userVotes.map { t =>
      val p = t.split(",")
      (p(0).toLong, (p(1).toLong, p(2).toLong))
    }
    // Self-join per user; keep each unordered item pair once (lower id first).
    val jn = userVotesPairs.join(userVotesPairs).values.filter(t => t._1._1 < t._2._1)
    val first = jn.map(t => ((t._1._1, t._2._1), (t._1._2, t._2._2)))
    // val instead of var: `second` is never reassigned.
    val second = first.map(t => ((t._1._1, t._2._1), ListItemCorrelation.up_down(t._1._2, t._2._2)))
    //More functionality
    // NOTE(review): `result` is not defined in the visible code; it is
    // presumably computed from `second` in the elided section.
    return result
  }
}
// NOTE(review): as pasted, this snippet declares `object ListItemCorrelation`
// twice in the same package (once holding up_down, once here for main).
// Scala forbids two top-level objects with the same name in one file — merge
// them into a single companion object.
object ListItemCorrelation extends Serializable {
  /** Driver entry point: args(0) = votes input path, args(1) = output path. */
  def main(args: Array[String]) {
    val votesPath = args(0)
    val sparkConf = new SparkConf().setAppName("SparkJoins").setMaster("local")
    val sparkContext = new SparkContext(sparkConf)
    val correlationJob = new ListItemCorrelation(sparkContext)
    val pairResults = correlationJob.run(votesPath)
    pairResults.saveAsTextFile(args(1))
    sparkContext.stop()
  }
}
package com.ranker.correlation.listitem
导入org.apache.spark.SparkConf
导入org.apache.spark.SparkContext
导入org.apache.spark.SparkContext_
导入org.apache.spark.rdd_
导入scala.collection.Map
对象ListItemCorrelation{
def up_down(dirX:Long,dirY:Long):(Long,Long,Long){
if(dirX等于(1)){
如果(dirY等于(1)){
返回(1,0,0,0)
}否则{
返回(0,1,0,0)
}
}否则{
如果(dirY等于(1)){
返回(0,0,1,0)
}否则{
返回(0,0,0,1)
}
}
}
}
类ListItemCorrelation(sc:SparkContext)扩展了可序列化{
def run(投票:字符串):RDD[((长,长),(长,长,长,长))={
val uservoces=sc.textFile(投票)
val userVotesPairs=uservoals.map{t=>
val p=t.split(“,”)
(p(0).托龙,(p(1).托龙,p(2).托龙))
}
val jn=userVotesPairs.join(userVotesPairs.values.filter(t=>t.\u1.\u1,t.\u2.\u1),(t.\u1.\u2,t.\u2.))
var second=first.map(t=>((t.\u 1.\u 1,t.\u 2.\u 1),ListItemCorrelation.up.\u down(t.\u 1.\u 2,t.\u 2.\u 2)))
//更多功能
返回结果
}
}
对象ListItemCorrelation扩展了可序列化