
Scala: explicit conversion of VectorAssembler output to a dense vector


How can I convert the output of a VectorAssembler into a dense vector rather than a sparse one?

// imports assumed for this snippet
import scala.collection.mutable

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.VectorAssembler
import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier

val featureIndexer = new VectorAssembler()
  .setInputCols(Array("feature1", "feature2", "feature3"))
  .setOutputCol("indexedFeatures")

training_set_combined = training_set_combined.na.fill(-9999)
testing_set_combined = testing_set_combined.na.fill(-9999)

// training
val assembler = new VectorAssembler()
  .setInputCols(feature_names.toArray)
  .setOutputCol("features")

def get_param(): mutable.HashMap[String, Any] = {
  val params = new mutable.HashMap[String, Any]()
  params += "eta" -> 0.1f
  params += "num_round" -> 150
  params += "missing" -> -999
  params += "subsample" -> 1
  params += "objective" -> "binary:logistic"
  params
}

val xgb = new XGBoostClassifier(get_param().toMap)
  .setLabelCol("label")
  .setFeaturesCol("features")
val pipeline = new Pipeline().setStages(Array(assembler, xgb))
val xgbclassifier = pipeline.fit(training_set_combined)

I would like the VectorAssembler output to be a dense vector.
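For context, the core of such a conversion is just calling toDense on each vector. A minimal standalone sketch (the DataFrame name assembled and the output column name are assumptions for illustration, not from the question):

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.functions.{col, udf}

// Hypothetical DataFrame `assembled` holding the VectorAssembler output in column "features".
val toDense = udf((v: Vector) => v.toDense)
val densified = assembled.withColumn("features_dense", toDense(col("features")))

The answer below wraps the same idea in a reusable Transformer so it can be added as a Pipeline stage.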

Here is an implementation of a dense-vector converter Transformer:

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.SQLDataTypes
import org.apache.spark.ml.param.shared.{HasInputCols, HasOutputCols}
import org.apache.spark.ml.param.{ParamMap, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions._


/**
 * Transformer that converts one or more vector columns (e.g. the output of a
 * VectorAssembler) into dense vector columns.
 */
class DenseVectorConverter(val uid: String) extends Transformer with Params
  with HasInputCols with HasOutputCols with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("denseVectorConverter"))

  /** @group setParam */
  def setInputCols(value: Array[String]): this.type = set(inputCols, value)

  /** @group setParam */
  def setOutputCols(value: Array[String]): this.type = set(outputCols, value)

  def validateAndTransformSchema(schema: StructType): StructType = {
    require($(inputCols).length == $(inputCols).distinct.length, s"inputCols contains" +
      s" duplicates: (${$(inputCols).mkString(", ")})")
    require($(outputCols).length == $(outputCols).distinct.length, s"outputCols contains" +
      s" duplicates: (${$(outputCols).mkString(", ")})")
    require($(inputCols).length == $(outputCols).length, s"inputCols(${$(inputCols).length})" +
      s" and outputCols(${$(outputCols).length}) should have the same length")

    // Every input column must be a vector column; each output column mirrors its input field.
    $(inputCols).zip($(outputCols)).foldLeft(schema) { (schema, inOutCol) =>
      val inputField = schema(inOutCol._1)
      require(inputField.dataType == SQLDataTypes.VectorType, s"Expected datatype of input col: ${inputField.name} as " +
        s"vector but found ${inputField.dataType}")
      schema.add(inOutCol._2, inputField.dataType, inputField.nullable, inputField.metadata)
    }
  }

  override def transformSchema(schema: StructType): StructType = validateAndTransformSchema(schema)

  override def copy(extra: ParamMap): DenseVectorConverter = defaultCopy(extra)

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    // Vector.toDense handles both sparse and dense inputs, so the UDF can be applied unconditionally.
    val sparseToDense =
      udf((v: org.apache.spark.ml.linalg.Vector) => v.toDense)
    $(inputCols).zip($(outputCols)).foldLeft(dataset.toDF()) { (df, inputColOutputCol) =>
      df.withColumn(inputColOutputCol._2,
        sparseToDense(col(inputColOutputCol._1)))
    }
  }
}

object DenseVectorConverter extends DefaultParamsReadable[DenseVectorConverter] {
  override def load(path: String): DenseVectorConverter = super.load(path)
}
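Because the converter mixes in DefaultParamsWritable and has a DefaultParamsReadable companion object, a pipeline model containing it can be saved and reloaded like any built-in stage, provided the class is on the classpath at load time. A minimal sketch, assuming a fitted PipelineModel named xgbclassifier as in the code further down and a placeholder path:

import org.apache.spark.ml.PipelineModel

// Placeholder path; xgbclassifier is the fitted PipelineModel produced below.
xgbclassifier.write.overwrite().save("/tmp/dense-xgb-pipeline")
val reloaded = PipelineModel.load("/tmp/dense-xgb-pipeline")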


I tested it with the following test case:

import org.apache.spark.ml.linalg.Vectors

val data = Array(
  Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
  Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
  Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
)
val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
df.show(false)
// +---------------------+
// |features             |
// +---------------------+
// |(5,[1,3],[1.0,7.0])  |
// |[2.0,0.0,3.0,4.0,5.0]|
// |[4.0,0.0,0.0,6.0,7.0]|
// +---------------------+

val denseVectorConverter = new DenseVectorConverter()
  .setInputCols(Array("features"))
  .setOutputCols(Array("features_dense"))
denseVectorConverter.transform(df).show(false)
// +---------------------+---------------------+
// |features             |features_dense       |
// +---------------------+---------------------+
// |(5,[1,3],[1.0,7.0])  |[0.0,1.0,0.0,7.0,0.0]|
// |[2.0,0.0,3.0,4.0,5.0]|[2.0,0.0,3.0,4.0,5.0]|
// |[4.0,0.0,0.0,6.0,7.0]|[4.0,0.0,0.0,6.0,7.0]|
// +---------------------+---------------------+
With the converter in place, your modified code should look like this:

val featureIndexer = new VectorAssembler()
  .setInputCols(Array("feature1", "feature2", "feature3"))
  .setOutputCol("indexedFeatures")

training_set_combined = training_set_combined.na.fill(-9999)
testing_set_combined = testing_set_combined.na.fill(-9999)

// training
val assembler = new VectorAssembler()
  .setInputCols(feature_names.toArray)
  .setOutputCol("features")

val denseVectorConverter = new DenseVectorConverter()
  .setInputCols(Array("features"))
  .setOutputCols(Array("features_dense"))

def get_param(): mutable.HashMap[String, Any] = {
  val params = new mutable.HashMap[String, Any]()
  params += "eta" -> 0.1f
  params += "num_round" -> 150
  params += "missing" -> -999
  params += "subsample" -> 1
  params += "objective" -> "binary:logistic"
  params
}

// XGBoost now reads the densified column, and the converter sits between the assembler and the classifier.
val xgb = new XGBoostClassifier(get_param().toMap)
  .setLabelCol("label")
  .setFeaturesCol("features_dense")
val pipeline = new Pipeline().setStages(Array(assembler, denseVectorConverter, xgb))
val xgbclassifier = pipeline.fit(training_set_combined)

I have modified your code; please test it and let me know.

Hi Someshwar, thanks for the help. I will give it a try. Could you also show me how to fit the dense conversion of the VectorAssembler output into the code below?

Hi Naveed, please create a transformer that converts the VectorAssembler's output column into a dense column, say the stage is denseConverter, and update the pipeline to val pipeline = new Pipeline().setStages(Array(assembler, denseConverter, xgb)). Please search for sample Transformer code; I hope this helps. Note that I am not sure why you want to convert to dense, but be aware that dense vectors take more memory than sparse ones.

I want to pass the vectors from the VectorAssembler to an XGBoost model, but the latest XGBoost version treats 0 as a missing value, while 0 is meaningful data in my training set. The only way to make XGBoost treat anything other than 0 as missing is to convert to dense vectors and specify that other value as missing.
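Picking up on that last comment: once the features are dense, the sentinel used for missing values can be declared explicitly on the classifier. A minimal sketch, assuming the setMissing setter from xgboost4j-spark and reusing the -9999 fill value from the na.fill calls above (the original params map uses -999, so the two values should be aligned in real code):

// Sketch only: with dense vectors, zeros stay as real feature values and the
// na.fill sentinel is declared as the missing marker instead.
val xgb = new XGBoostClassifier(get_param().toMap)
  .setLabelCol("label")
  .setFeaturesCol("features_dense")
  .setMissing(-9999.0f)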