Apache Spark: cannot read a pipeline model with a custom UnaryTransformer
I defined a new custom UnaryTransformer in Spark (cleanText in the sample code below) and use it in a pipeline. When I save the fitted pipeline and then try to read it back, I get the following error:

java.lang.NoSuchMethodException: test_job$cleanText.read()

Saving and loading the UnaryTransformer on its own works fine.
Comments on the question:
Possibly related to one of these bugs. @Ioannis, can you take a look and see whether the posted solution works for you?
Thanks for the pointers, both. @c.fish, I did use a companion object; this is definitely a Spark issue.
@Ioannis, it sounds like Databricks could fix this if you raise a support ticket asking about it!
Did you ever find a solution to this?

Sample code to reproduce the error (tested in Spark 2.2):
import org.apache.spark.ml.{Pipeline, PipelineModel, UnaryTransformer}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{DataType, StringType}
object test_job {

  class cleanText(override val uid: String)
      extends UnaryTransformer[String, String, cleanText] with DefaultParamsWritable {

    def this() = this(Identifiable.randomUID("cleantext"))

    override protected def validateInputType(inputType: DataType): Unit = {
      require(inputType == StringType)
    }

    // Replace every character that is not a letter or a digit with a space.
    protected def createTransformFunc: String => String = {
      val regex = "[^a-zA-Z0-9]".r
      s => regex.replaceAllIn(s, m => " ")
    }

    protected def outputDataType: DataType = StringType
  }

  object cleanText extends DefaultParamsReadable[cleanText]
  //{
  //  override def load(path: String): cleanText = super.load(path)
  //}

  def main(args: Array[String]): Unit = {
    val sqlc = SparkSession.builder.appName("test_job").getOrCreate()
    import sqlc.implicits._

    val cleaner = new cleanText()
    cleaner.setInputCol("word").setOutputCol("r_clean")

    val someDF = Seq(
      (1, "sample text 1"),
      (2, "sample text 2"),
      (3, "sample text 3")
    ).toDF("number", "word")

    val pipeline = new Pipeline().setStages(Array(cleaner))
    val pipeline_fitted = pipeline.fit(someDF)
    pipeline_fitted.write.overwrite().save("/tmp/model/")
    // Saving just the transformer:
    // cleaner.write.overwrite().save("/tmp/model/")
    println("Pipeline saved")

    // Fails with java.lang.NoSuchMethodException: test_job$cleanText.read()
    val pl2 = PipelineModel.load("/tmp/model/")
    // Loading just the transformer will work:
    // val cln = cleanText.load("/tmp/model/")
    println("Pipeline loaded")

    sqlc.stop()
  }
}
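
The root cause appears to be where the class is defined, not the transformer itself. In Spark 2.x, PipelineModel.load resolves each stage's reader by reflection, roughly cls.getMethod("read").invoke(null), so it needs a static read() method on the stage's class. The Scala compiler emits that static forwarder only for top-level companion objects; for a class nested inside test_job the companion compiles to test_job$cleanText$ with no forwarder on test_job$cleanText, which matches the NoSuchMethodException above. Below is a minimal sketch of the commonly suggested workaround, assuming the transformer can be moved out of test_job to the top level of the file:

import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.types.{DataType, StringType}

// Top-level class: its companion object below now gets a static read()
// forwarder that Spark's DefaultParamsReader can locate by reflection.
class cleanText(override val uid: String)
    extends UnaryTransformer[String, String, cleanText] with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("cleantext"))

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType == StringType)
  }

  protected def createTransformFunc: String => String = {
    val regex = "[^a-zA-Z0-9]".r
    s => regex.replaceAllIn(s, m => " ")
  }

  protected def outputDataType: DataType = StringType
}

// Top-level companion object; overriding load() is not required.
object cleanText extends DefaultParamsReadable[cleanText]

object test_job {
  def main(args: Array[String]): Unit = {
    // ...same main body as above; PipelineModel.load("/tmp/model/")
    // should now resolve cleanText.read() and succeed.
  }
}

With this layout, the standalone cleanText.load path and the full pipeline save/load path go through the same static read() lookup, so the pipeline case should no longer throw.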