Apache spark 如何在UDF中创建数据帧

Apache spark 如何在UDF中创建数据帧,apache-spark,apache-spark-sql,Apache Spark,Apache Spark Sql

我有个问题。我想在UDF中创建一个数据帧,并使用我的模型将其转换为另一个数据帧,但是遇到了一个异常。是我使用 Spark 的方式有什么问题吗?我不清楚原因,有人能帮我解决这个问题吗?

代码:

val model = PipelineModel.load("/user/abel/model/pipeline_model")
val modelBroad = spark.sparkContext.broadcast(model)

def model_predict(id:Long, text:String):Double = {
  val modelLoaded = modelBroad.value
  val sparkss = SparkSession.builder.master("local[*]").getOrCreate()
  val dataDF = sparkss.createDataFrame(Seq((id,text))).toDF("id","text")
  val result = modelLoaded.transform(dataDF).select("prediction").collect().apply(0).getDouble(0)
  println(f"The prediction of $id and $text is $result")
  result
}

val udf_func = udf(model_predict _)
test.withColumn("prediction",udf_func($"id",$"text")).show()
Caused by: java.lang.NullPointerException
        at org.apache.spark.sql.execution.SparkPlan.sparkContext(SparkPlan.scala:56)
        at org.apache.spark.sql.execution.LocalTableScanExec.metrics$lzycompute(LocalTableScanExec.scala:37)
        at org.apache.spark.sql.execution.LocalTableScanExec.metrics(LocalTableScanExec.scala:36)
        at org.apache.spark.sql.execution.SparkPlan.resetMetrics(SparkPlan.scala:85)
        at org.apache.spark.sql.Dataset$$anonfun$withAction$1.apply(Dataset.scala:3366)
        at org.apache.spark.sql.Dataset$$anonfun$withAction$1.apply(Dataset.scala:3365)
        at org.apache.spark.sql.catalyst.trees.TreeNode.foreach(TreeNode.scala:117)
        at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3365)
        at org.apache.spark.sql.Dataset.collect(Dataset.scala:2788)
        at com.zamplus.mine.SparkSubmit$.com$zamplus$mine$SparkSubmit$$model_predict$1(SparkSubmit.scala:21)
        at com.zamplus.mine.SparkSubmit$$anonfun$1.apply(SparkSubmit.scala:40)
        at com.zamplus.mine.SparkSubmit$$anonfun$1.apply(SparkSubmit.scala:40)
        ... 22 more

异常信息:

val model = PipelineModel.load("/user/abel/model/pipeline_model")
val modelBroad = spark.sparkContext.broadcast(model)

def model_predict(id:Long, text:String):Double = {
  val modelLoaded = modelBroad.value
  val sparkss = SparkSession.builder.master("local[*]").getOrCreate()
  val dataDF = sparkss.createDataFrame(Seq((id,text))).toDF("id","text")
  val result = modelLoaded.transform(dataDF).select("prediction").collect().apply(0).getDouble(0)
  println(f"The prediction of $id and $text is $result")
  result
}

val udf_func = udf(model_predict _)
test.withColumn("prediction",udf_func($"id",$"text")).show()
Caused by: java.lang.NullPointerException
        at org.apache.spark.sql.execution.SparkPlan.sparkContext(SparkPlan.scala:56)
        at org.apache.spark.sql.execution.LocalTableScanExec.metrics$lzycompute(LocalTableScanExec.scala:37)
        at org.apache.spark.sql.execution.LocalTableScanExec.metrics(LocalTableScanExec.scala:36)
        at org.apache.spark.sql.execution.SparkPlan.resetMetrics(SparkPlan.scala:85)
        at org.apache.spark.sql.Dataset$$anonfun$withAction$1.apply(Dataset.scala:3366)
        at org.apache.spark.sql.Dataset$$anonfun$withAction$1.apply(Dataset.scala:3365)
        at org.apache.spark.sql.catalyst.trees.TreeNode.foreach(TreeNode.scala:117)
        at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3365)
        at org.apache.spark.sql.Dataset.collect(Dataset.scala:2788)
        at com.zamplus.mine.SparkSubmit$.com$zamplus$mine$SparkSubmit$$model_predict$1(SparkSubmit.scala:21)
        at com.zamplus.mine.SparkSubmit$$anonfun$1.apply(SparkSubmit.scala:40)
        at com.zamplus.mine.SparkSubmit$$anonfun$1.apply(SparkSubmit.scala:40)
        ... 22 more


你的UDF有问题。UDF 会分布在多个执行器实例上运行,无法直接访问驱动端作用域里的变量。因此,您应该把UDF内部用到的所有全局变量(例如
modelBroad
)作为参数显式传入,否则就会出现
空指针异常(NullPointerException)

此外,您的UDF还有一些未遵循的最佳实践,其中包括:

  • 不要在UDF中创建
    SparkSession
    。否则会创建出多个 SparkSession,从而引发问题。如果确实需要,请将全局的 SparkSession 作为参数传入UDF,而不要在UDF内部新建
  • 删除UDF中不必要的println,它同样会影响您的返回值
  • 我更改了你的代码,仅供参考。它只是理想UDF的一个原型。请相应地更改它

    val sparkss = SparkSession.builder.master("local[*]").getOrCreate()
    val model = PipelineModel.load("/user/abel/model/pipeline_model")
    val modelBroad = spark.sparkContext.broadcast(model)
    
    def model_predict(id:Long, text:String,spark:SparkSession,modelBroad:Broadcast[PipelineModel]):Double = {
      val modelLoaded = modelBroad.value
      val dataDF = spark.createDataFrame(Seq((id,text))).toDF("id","text")
      val result = modelLoaded.transform(dataDF).select("prediction").collect().apply(0).getDouble(0)
      result
    }
    
    val udf_func = udf(model_predict _)
    test.withColumn("prediction",udf_func($"id",$"text",lit(sparkss),lit(modelBroad))).show()
    
    val sparkss = SparkSession.builder.master("local[*]").getOrCreate()
    val model = PipelineModel.load("/user/abel/model/pipeline_model")
    val modelBroad = spark.sparkContext.broadcast(model)

    def model_predict(id:Long, text:String, spark:SparkSession, modelBroad:Broadcast[PipelineModel]):Double = {
      val modelLoaded = modelBroad.value
      val dataDF = spark.createDataFrame(Seq((id,text))).toDF("id","text")
      val result = modelLoaded.transform(dataDF).select("prediction").collect().apply(0).getDouble(0)
      result
    }

    val udf_func = udf(model_predict _)
    test.withColumn("prediction",udf_func($"id",$"text",lit(sparkss),lit(modelBroad))).show()
    
    谢谢!但是我仍然遇到 NullPointerException。——你能更新一下你的问题吗?