Apache spark 如何在UDF中创建数据帧
我有个问题。我想在UDF中创建一个数据帧,并使用我的模型将其转换为另一个数据帧。但我有个例外。火花形态有什么问题吗?我不知道。有人能帮我解决这个问题吗 代码:Apache spark 如何在UDF中创建数据帧,apache-spark,apache-spark-sql,Apache Spark,Apache Spark Sql,我有个问题。我想在UDF中创建一个数据帧,并使用我的模型将其转换为另一个数据帧。但我有个例外。火花形态有什么问题吗?我不知道。有人能帮我解决这个问题吗 代码: val model = PipelineModel.load("/user/abel/model/pipeline_model") val modelBroad = spark.sparkContext.broadcast(model) def model_predict(id:Long, text:String):Double = {
val model = PipelineModel.load("/user/abel/model/pipeline_model")
val modelBroad = spark.sparkContext.broadcast(model)
def model_predict(id:Long, text:String):Double = {
val modelLoaded = modelBroad.value
val sparkss = SparkSession.builder.master("local[*]").getOrCreate()
val dataDF = sparkss.createDataFrame(Seq((id,text))).toDF("id","text")
val result = modelLoaded.transform(dataDF).select("prediction").collect().apply(0).getDouble(0)
println(f"The prediction of $id and $text is $result")
result
}
val udf_func = udf(model_predict _)
test.withColumn("prediction",udf_func($"id",$"text")).show()
Caused by: java.lang.NullPointerException
at org.apache.spark.sql.execution.SparkPlan.sparkContext(SparkPlan.scala:56)
at org.apache.spark.sql.execution.LocalTableScanExec.metrics$lzycompute(LocalTableScanExec.scala:37)
at org.apache.spark.sql.execution.LocalTableScanExec.metrics(LocalTableScanExec.scala:36)
at org.apache.spark.sql.execution.SparkPlan.resetMetrics(SparkPlan.scala:85)
at org.apache.spark.sql.Dataset$$anonfun$withAction$1.apply(Dataset.scala:3366)
at org.apache.spark.sql.Dataset$$anonfun$withAction$1.apply(Dataset.scala:3365)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreach(TreeNode.scala:117)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3365)
at org.apache.spark.sql.Dataset.collect(Dataset.scala:2788)
at com.zamplus.mine.SparkSubmit$.com$zamplus$mine$SparkSubmit$$model_predict$1(SparkSubmit.scala:21)
at com.zamplus.mine.SparkSubmit$$anonfun$1.apply(SparkSubmit.scala:40)
at com.zamplus.mine.SparkSubmit$$anonfun$1.apply(SparkSubmit.scala:40)
... 22 more
例外情况:
val model = PipelineModel.load("/user/abel/model/pipeline_model")
val modelBroad = spark.sparkContext.broadcast(model)
def model_predict(id:Long, text:String):Double = {
val modelLoaded = modelBroad.value
val sparkss = SparkSession.builder.master("local[*]").getOrCreate()
val dataDF = sparkss.createDataFrame(Seq((id,text))).toDF("id","text")
val result = modelLoaded.transform(dataDF).select("prediction").collect().apply(0).getDouble(0)
println(f"The prediction of $id and $text is $result")
result
}
val udf_func = udf(model_predict _)
test.withColumn("prediction",udf_func($"id",$"text")).show()
Caused by: java.lang.NullPointerException
at org.apache.spark.sql.execution.SparkPlan.sparkContext(SparkPlan.scala:56)
at org.apache.spark.sql.execution.LocalTableScanExec.metrics$lzycompute(LocalTableScanExec.scala:37)
at org.apache.spark.sql.execution.LocalTableScanExec.metrics(LocalTableScanExec.scala:36)
at org.apache.spark.sql.execution.SparkPlan.resetMetrics(SparkPlan.scala:85)
at org.apache.spark.sql.Dataset$$anonfun$withAction$1.apply(Dataset.scala:3366)
at org.apache.spark.sql.Dataset$$anonfun$withAction$1.apply(Dataset.scala:3365)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreach(TreeNode.scala:117)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3365)
at org.apache.spark.sql.Dataset.collect(Dataset.scala:2788)
at com.zamplus.mine.SparkSubmit$.com$zamplus$mine$SparkSubmit$$model_predict$1(SparkSubmit.scala:21)
at com.zamplus.mine.SparkSubmit$$anonfun$1.apply(SparkSubmit.scala:40)
at com.zamplus.mine.SparkSubmit$$anonfun$1.apply(SparkSubmit.scala:40)
... 22 more
你的UDF有问题。UDF在多个实例上运行,并使用我们在其中使用的所有变量。因此,您应该将所有必需的全局变量作为参数传递,例如
modelBroad
,否则它将给您null指针异常
在UDF中,您没有遵循更多的好实践。其中包括:
spark会话
。否则将创建多个spark会话,这将导致问题。如果需要,将全局spark会话作为UDF中的变量传递,而不是此过程val sparkss = SparkSession.builder.master("local[*]").getOrCreate()
val model = PipelineModel.load("/user/abel/model/pipeline_model")
val modelBroad = spark.sparkContext.broadcast(model)
def model_predict(id:Long, text:String,spark:SparkSession,modelBroad:<datatype>):Double = {
val modelLoaded = modelBroad.value
val dataDF = spark.createDataFrame(Seq((id,text))).toDF("id","text")
val result = modelLoaded.transform(dataDF).select("prediction").collect().apply(0).getDouble(0)
result
}
val udf_func = udf(model_predict _)
test.withColumn("prediction",udf_func($"id",$"text",lit(sparkss),lit(modelBroad))).show()
val sparkss=SparkSession.builder.master(“local[*]”)。getOrCreate()
val model=PipelineModel.load(“/user/abel/model/pipeline_model”)
val modelBroad=spark.sparkContext.broadcast(模型)
def model_predict(id:Long,text:String,spark:SparkSession,modelBroad:):Double={
val modelLoaded=modelBroad.value
val dataDF=spark.createDataFrame(Seq((id,text)).toDF(“id”,“text”)
val result=modelLoaded.transform(dataDF)。选择(“预测”).collect().apply(0)。getDouble(0)
结果
}
val udf_func=udf(模型预测)
test.withColumn(“预测”,udf_func($“id”,“$”text),lit(sparkss),lit(modelBroad)).show()
谢谢!但是我仍然有nullpointerException你能更新你的问题吗?