How to rescale a number range around a shifted center in Spark/Scala?

Tags: function, apache-spark, machine-learning, logarithm

Which function in Spark can transform/rescale values from a range such as -infinity to +infinity, or -2 to 130, etc., onto a scale with a defined maximum?

In the example below, I want to make sure that 55 maps to 100 and that 100+ maps to 0.

Before     | After
45-55      | 90-100
35-44      | 80-89
100+ or <0 | 0-5
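For reference, the target mapping is tent-shaped rather than a single linear rescale: values should rise toward 100 as they approach the center and fall back toward 0 on either side. A minimal standalone sketch of that shape, where the center, lo, and hi parameters are assumptions taken from the numbers above (exact outputs depend on the bounds you pick):

```scala
// Tent-shaped rescale (not Spark-specific): lo -> 0, center -> 100, hi -> 0.
// center = 55, lo = -2, hi = 130 are taken from the question's examples.
def rescale(x: Double, center: Double = 55.0, lo: Double = -2.0, hi: Double = 130.0): Double =
  if (x < center) (x - lo) / (center - lo) * 100.0  // rising side: lo -> 0, center -> 100
  else (hi - x) / (hi - center) * 100.0             // falling side: center -> 100, hi -> 0

rescale(55.0)   // 100.0
rescale(130.0)  // 0.0
```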


Is there anything useful for this?

I was able to solve it, thanks to the help from @user6910411. Depending on your data you can use dense or sparse vectors, replace MinMaxScaler with MaxAbsScaler, and extract the values with linalg.Vectors / DenseVector. The idea is to split the data in half at the desired middle value, which is also the point where the scale inverts, then scale the two parts separately and union the DataFrames.

import org.apache.spark.ml.feature.MaxAbsScaler   // alternative scaler mentioned above
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.functions.{col, lit, round, udf}
import spark.implicits._  // needs a SparkSession named spark; enables the 'id / 'score syntax

// UDF that pulls one element out of the scaled feature vector
val vectorToColumn = udf { (x: DenseVector, index: Int) => x(index) }

// Split at the desired center (55) and negate the upper half, so that
// larger original scores end up at the low end after scaling
val gt50 = df.filter("score >= 55").select('id, ('score * -1).as("score"))
val lt50 = df.filter("score < 55")

// MinMaxScaler operates on vector columns, so wrap the score first
val assembler = new VectorAssembler()
  .setInputCols(Array("score"))
  .setOutputCol("features")

val ass_lt50 = assembler.transform(lt50)
val ass_gt50 = assembler.transform(gt50)

// Rescale each half independently onto [0, 100]
val scaler = new MinMaxScaler()
  .setInputCol("features")
  .setOutputCol("featuresScaled")
  .setMax(100)
  .setMin(0)

val feat_lt50 = scaler.fit(ass_lt50).transform(ass_lt50).drop('score)
val feat_gt50 = scaler.fit(ass_gt50).transform(ass_gt50).drop('score)

// Extract the scaled value from the vector and round it
val scaled_lt50 = feat_lt50.select('id, round(
  vectorToColumn(col("featuresScaled"), lit(0))).as("scaled_score"))

val scaled_gt50 = feat_gt50.select('id, round(
  vectorToColumn(col("featuresScaled"), lit(0))).as("scaled_score"))

// Recombine the two halves (union is the preferred name in Spark 2.x+)
val scaled = scaled_lt50.unionAll(scaled_gt50)
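A minimal usage sketch: the DataFrame below is invented for illustration and just provides the id/score columns the code expects; define it as df before running the snippet above.

```scala
// Hypothetical input data matching the id/score schema used above
val df = Seq((1, 45.0), (2, 55.0), (3, 35.0), (4, 120.0), (5, -1.0))
  .toDF("id", "score")

// After running the snippet above:
scaled.orderBy("id").show()
// Scores near 55 land at the top of the scaled range, while scores near
// the extremes (well below 0 or above 100) land near 0.
```

One caveat on the alternative named above: MaxAbsScaler scales each feature into [-1, 1] by dividing by its maximum absolute value and has no setMin/setMax, so its output would still need multiplying up to the 0-100 range.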
