Scala 如何解决用户定义函数的SparkException?
我想对数据集应用线性回归Scala 如何解决用户定义函数的SparkException?,scala,apache-spark,linear-regression,Scala,Apache Spark,Linear Regression,我想对数据集应用线性回归 val featureCols = Array("molecule_id", "group_id", "atom_id", "atom_id2", "mweight") val assembler = new VectorAssembler() .setInputCols(featureCols).setOutputCol("features") val df2 = assembler.transform(df) val labelIndexer = new
val featureCols = Array("molecule_id", "group_id", "atom_id", "atom_id2", "mweight")
val assembler = new VectorAssembler()
.setInputCols(featureCols).setOutputCol("features")
val df2 = assembler.transform(df)
val labelIndexer = new StringIndexer().setInputCol("logp").setOutputCol("label")
val df3 = labelIndexer.fit(df2).transform(df2)
val Array(trainingData, testData)= df3.randomSplit(Array(0.8, 0.2))
val linearRegression = new LinearRegression()
.setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
val linearRegressionModel = linearRegression.fit(trainingData)
19/09/16 13:09:54错误执行者:阶段29.0中任务0.0中出现异常
(TID 29)org.apache.spark.SparkException:无法执行用户
已定义函数($anonfun$9:(字符串)=>double)位于
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GenerateEditorForCodeGenStage5.processNext(未知
来源)在
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
在
org.apache.spark.sql.execution.whisttagecodegenexec$$anonfun$10$$anon$1.hasNext(whisttagecodegenexec.scala:614)
位于scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)位于
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)位于
scala.collection.Iterator$class.foreach(Iterator.scala:891)位于
scala.collection.AbstractIterator.foreach(迭代器.scala:1334)位于
scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:157)
位于scala.collection.AbstractIterator.foldLeft(Iterator.scala:1334)
在
scala.collection.TraversableOnce$class.aggregate(TraversableOnce.scala:214)
位于scala.collection.AbstractIterator.aggregate(Iterator.scala:1334)
在
org.apache.spark.rdd.rdd$$anonfun$treeAggegate$1$$anonfun$23.apply(rdd.scala:1139)
在
org.apache.spark.rdd.rdd$$anonfun$treeAggegate$1$$anonfun$23.apply(rdd.scala:1139)
在
org.apache.spark.rdd.rdd$$anonfun$treeAggegate$1$$anonfun$24.apply(rdd.scala:1140)
在
org.apache.spark.rdd.rdd$$anonfun$treeAggegate$1$$anonfun$24.apply(rdd.scala:1140)
在
org.apache.spark.rdd.rdd$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(rdd.scala:800)
在
org.apache.spark.rdd.rdd$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(rdd.scala:800)
在
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
在org.apache.spark.rdd.rdd.computeOrReadCheckpoint(rdd.scala:324)
位于org.apache.spark.rdd.rdd.iterator(rdd.scala:288)
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)位于
org.apache.spark.scheduler.Task.run(Task.scala:109)位于
org.apache.spark.executor.executor$TaskRunner.run(executor.scala:345)
在
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
在
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
运行(Thread.java:748)
你能提供足够的代码来重现你的问题吗?(特别是以某种方式生成看起来像您正在操作的数据)感谢您的回复。错误是由于返回数据帧的私有方法引起的。我通过去掉那个函数解决了这个问题