Apache Spark: cannot read a pipeline model with a custom UnaryTransformer
I defined a new custom UnaryTransformer in Spark (cleanText in the sample code below) and use it in a pipeline. When I save the fitted pipeline and then try to read it back, I get the following error:

java.lang.NoSuchMethodException: test_job$cleanText.read()

Saving and loading the UnaryTransformer on its own works fine.
Comments on the question:
Possibly related to one of these bugs. @Ioannis, can you take a look and see whether the posted solution works for you?
Thanks for the pointers, both. @c.fish, I did use a companion object; this is definitely a Spark issue.
@Ioannis, it sounds like Databricks could fix this if you raise a support ticket asking about it!
Did you ever find a solution to this?

Sample code to reproduce the error (tested in Spark 2.2):
import org.apache.spark.ml.{Pipeline, PipelineModel, UnaryTransformer}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{DataType, StringType}
object test_job {

  class cleanText(override val uid: String)
      extends UnaryTransformer[String, String, cleanText] with DefaultParamsWritable {

    def this() = this(Identifiable.randomUID("cleantext"))

    override protected def validateInputType(inputType: DataType): Unit = {
      require(inputType == StringType)
    }

    // Replace every character that is not a letter or a digit with a space.
    protected def createTransformFunc: String => String = {
      val regex = "[^a-zA-Z0-9]".r
      s => regex.replaceAllIn(s, m => " ")
    }

    protected def outputDataType: DataType = StringType
  }

  object cleanText extends DefaultParamsReadable[cleanText]
  //{
  //  override def load(path: String): cleanText = super.load(path)
  //}

  def main(args: Array[String]): Unit = {
    val sqlc = SparkSession.builder.appName("test_job").getOrCreate()
    import sqlc.implicits._

    val cleaner = new cleanText()
    cleaner.setInputCol("word").setOutputCol("r_clean")

    val someDF = Seq(
      (1, "sample text 1"),
      (2, "sample text 2"),
      (3, "sample text 3")
    ).toDF("number", "word")

    val pipeline = new Pipeline().setStages(Array(cleaner))
    val pipeline_fitted = pipeline.fit(someDF)
    pipeline_fitted.write.overwrite().save("/tmp/model/")
    // Saving just the transformer:
    // cleaner.write.overwrite().save("/tmp/model/")
    println("Pipeline saved")

    // Fails with java.lang.NoSuchMethodException: test_job$cleanText.read()
    val pl2 = PipelineModel.load("/tmp/model/")
    // Loading just the transformer will work:
    // val cln = cleanText.load("/tmp/model/")
    println("Pipeline loaded")

    sqlc.stop()
  }
}
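
The root cause appears to be where the class is defined, not the transformer itself. In Spark 2.x, PipelineModel.load resolves each stage's reader by reflection, roughly cls.getMethod("read").invoke(null), so it needs a static read() method on the stage's class. The Scala compiler emits that static forwarder only for top-level companion objects; for a class nested inside test_job the companion compiles to test_job$cleanText$ with no forwarder on test_job$cleanText, which matches the NoSuchMethodException above. Below is a minimal sketch of the commonly suggested workaround, assuming the transformer can be moved out of test_job to the top level of the file:

import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.types.{DataType, StringType}

// Top-level class: its companion object below now gets a static read()
// forwarder that Spark's DefaultParamsReader can locate by reflection.
class cleanText(override val uid: String)
    extends UnaryTransformer[String, String, cleanText] with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("cleantext"))

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType == StringType)
  }

  protected def createTransformFunc: String => String = {
    val regex = "[^a-zA-Z0-9]".r
    s => regex.replaceAllIn(s, m => " ")
  }

  protected def outputDataType: DataType = StringType
}

// Top-level companion object; overriding load() is not required.
object cleanText extends DefaultParamsReadable[cleanText]

object test_job {
  def main(args: Array[String]): Unit = {
    // ...same main body as above; PipelineModel.load("/tmp/model/")
    // should now resolve cleanText.read() and succeed.
  }
}

With this layout, the standalone cleanText.load path and the full pipeline save/load path go through the same static read() lookup, so the pipeline case should no longer throw.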