Apache spark 寻找以原始数据作为输入的流式Spark ML示例
我是Spark ML的新手。我正在寻找以原始数据作为输入的流式ML示例(我指的是原始字符串分隔的数据,而不是矢量化数据)。 我试着在大多数论坛上寻找类似的例子,但没有找到 因此,我采用以下方法在Spark 1.6上对Kmeans进行流式处理(流式Kmeans仍然适用于矢量数据,而不是数据帧), 但我不确定这是不是正确的方法Apache spark 寻找以原始数据作为输入的流式Spark ML示例,apache-spark,machine-learning,spark-streaming,k-means,apache-spark-mllib,Apache Spark,Machine Learning,Spark Streaming,K Means,Apache Spark Mllib,我是Spark ML的新手。我正在寻找以原始数据作为输入的流式ML示例(我指的是原始字符串分隔的数据,而不是矢量化数据)。 我试着在大多数论坛上寻找类似的例子,但没有找到 因此,我采用以下方法在Spark 1.6上对Kmeans进行流式处理(流式Kmeans仍然适用于矢量数据,而不是数据帧), 但我不确定这是不是正确的方法 输入来自卡夫卡的数据流记录,我将其转换为数据帧 构建了一个预数据管道来读取我需要转换为向量的列 因为在每个流批处理中,数据是不同的,向量长度可能不同,所以,我使用标记器和H
- 输入是来自 Kafka 的 DStream 记录,我将其转换为 DataFrame
- 构建了一个预数据管道来读取我需要转换为向量的列
- 因为每个流批次中的数据不同,生成的向量长度也可能不同,所以我使用 Tokenizer 和 HashingTF 来保持向量长度恒定
- 同样在每个流批处理中,为了确定向量属于哪一行,我分配了行号并将其发送给流Kmeans算法
- 在获得每行的集群ID之后,我将集群ID加入到行数据中,以获得最终的预测数据帧
- 据我所知,使用固定向量长度(setNumFeatures,在下面的代码中硬编码为 200)的 HashingTF 并不能解决我的问题:如果可能的向量组合数少于单个批次中的行数,相同的向量就会重复出现
- 我还尝试使用带有OneHotEncoder的StringIndexer生成向量,但我发现每个批次的向量长度都不同
- 由于我们有一种方法来分配用于流化Kmeans的随机中心,我必须在执行之前知道向量长度(对于我的测试数据和列,因为我得到的向量长度为800,所以我现在硬编码它以进行测试)
- 虽然为了训练模型,我们必须分别发送包含所有数据组合的训练向量,但现在我使用的训练向量和测试向量是一样的
import com.common.Configuration._
import com.twitter.bijection.Injection
import com.twitter.bijection.avro.GenericAvroCodecs
import kafka.serializer.{DefaultDecoder, StringDecoder}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.clustering.{KMeans, KMeansModel}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
import scala.util.Try;
object KafkaDataConsumer {
//localOrCluster=l streamBatchSeconds=2 locationToSaveStream=tmp3 isAvroTopic=text hiveTableName= modelName=streamkmeans hiveORFile=file locationToSaveModelData=tmp1 predictedDataLocation=tmp2 labelCol=salary nonStringCols= stringCols=workclass,age,education_num,hours_per_week csvOrHiveForTrain=data/mllib/adult.csv csvOrHiveForTest=data/mllib/adult.csv noOfNodes=10 noOfIter=10 locationToStorePmml=pmmlfolder runMLPipeline=true isOverwriteDataOk=no
// Setting the logs levels
setLogLevels(Level.WARN, Seq("spark", "org", "akka"))
/** Program entry point: hands the raw command-line arguments to the streaming job. */
def main(args: Array[String]): Unit = executeStreamdata(args)
/**
 * Builds and runs the streaming K-Means job.
 *
 * Reads records from Kafka (Avro-encoded or comma-delimited text), turns each
 * micro-batch into a DataFrame, runs the feature pipeline on it, trains a
 * StreamingKMeans model on the resulting vectors, predicts a cluster for every
 * row of the same batch and appends the rows plus their cluster id to
 * `predictedDataLocation`. Blocks until the streaming context terminates.
 *
 * @param args command-line arguments in `key=value` form; the keys read here are
 *             localOrCluster, streamBatchSeconds, isAvroTopic, labelCol,
 *             nonStringCols, stringCols, noOfNodes and predictedDataLocation.
 *             Every key is looked up with Map.apply, so a missing key throws
 *             NoSuchElementException.
 */
def executeStreamdata(args: Array[String]) {
val namedArgs = getNamedArgs(args)
val localOrCluster = namedArgs("localOrCluster")
val streamBatchSeconds = namedArgs("streamBatchSeconds").toInt
// Anything other than "avro" (case-insensitive) selects the plain-text topic.
val isAvroTopic = namedArgs("isAvroTopic").equalsIgnoreCase("avro")
// kafkaConfig is not defined in this file — presumably supplied by com.common.Configuration._ (verify).
val kafkaBrokerList = kafkaConfig.getString("kafkaBrokerList")
val kafkaTopicList =
if (isAvroTopic)
kafkaConfig.getString("TOPIC1_NAME")
else
kafkaConfig.getString("TOPIC2_NAME")
// For the Avro topic this string is parsed as an Avro schema below; for the text
// topic it is split on "," and used as the list of column names.
val SCHEMA =
if (isAvroTopic)
kafkaConfig.getString("TOPIC1_SCHEMA")
else
kafkaConfig.getString("TOPIC2_SCHEMA")
val labelCol = namedArgs("labelCol")
val nonStringCols = namedArgs("nonStringCols")
val stringCols = namedArgs("stringCols")
// Used as K, the number of clusters, for StreamingKMeans.
val noOfNodes = namedArgs("noOfNodes").toInt
val predictedDataLocation = namedArgs("predictedDataLocation")
val sc = getSparkContext(localOrCluster, "kafka stream application")
val ssc = new StreamingContext(sc, Seconds(streamBatchSeconds))
val sqlContext = new SQLContext(sc)
val topicsSet = kafkaTopicList.split(",").toSet
val kafkaParams = Map[String, String]("metadata.broker.list" -> kafkaBrokerList)
//Read Stream records either as AVRO or String
val dStreamRecords =
if (isAvroTopic) {
val parser = new Schema.Parser
val schema = parser.parse(SCHEMA)
val recordInjection: Injection[GenericRecord, Array[Byte]] = GenericAvroCodecs.toBinary(schema)
// Each Avro payload is decoded and rendered with toString; the resulting strings
// are later read with sqlContext.read.json.
// NOTE(review): `.get` on the decode result throws for a record that cannot be decoded.
KafkaUtils.createDirectStream[String, Array[Byte], StringDecoder, DefaultDecoder](ssc, kafkaParams, topicsSet)
.map(message => recordInjection.invert(message._2).get.toString)
}
else {
// Text topic: keep only the message value, drop the Kafka key.
KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet).map(_._2)
}
println("Executing Streaming KMeans")
//Pipeline for generating the label, features vector using StringTokenizer and HashFunction based on input label Columns, string and nonString Columns
val pipelineStagesWithAssembler = buildDataPrepPipeLine(sqlContext, labelCol, stringCols, nonStringCols, true)
val pipeline = new Pipeline().setStages(pipelineStagesWithAssembler)
// Schema of the non-vector columns of the latest batch. Written inside the
// transform closure below and read inside the foreachRDD closure further down;
// it stays null until the first batch has gone through transform.
var vectorsDFSchema:StructType = null
// Converts one micro-batch of raw strings into a DataFrame: JSON parsing for the
// Avro topic, otherwise a split on "," with every column typed as String.
def createDFFromDStream(rdd: RDD[String]): DataFrame = {
if (isAvroTopic)
sqlContext.read.json(rdd)
else
sqlContext.createDataFrame(rdd.map(x => Row.fromSeq(x.split(",").toSeq)), createSchemaFrmStrDelValues(SCHEMA))
}
val vectorDataWithRowNum = dStreamRecords.transform(rdd=>{
//Create dataframe from the input data
val dataframe = createDFFromDStream(rdd)
//Get the pipeline model
// The pipeline is re-fitted on every micro-batch.
val pipelineModel = pipeline.fit(dataframe)
//Get label, features vectors in the DataFrame
val vectorsDF = pipelineModel.transform(dataframe)
//get the row number for the dataframe
val vectorsDFWithRowNum = dfWithRowIndexUsingRDD(vectorsDF)
// Keep only the fields whose data type name does not contain "Vector".
vectorsDFSchema = StructType(vectorsDFWithRowNum.schema.filter(!_.dataType.toString.contains("Vector")))
//Get row number, vectors , row data
val reqData = vectorsDFWithRowNum.map(row => {
(row.getAs[Long]("row_num"), row.getAs[org.apache.spark.mllib.linalg.Vector]("features"), row)
})
reqData
})
// get train data(vector) , test data (row number or index, vector), row data (row number or index, row data)
val (dStreamTrainVector, dStreamTestVector, rowData) = (vectorDataWithRowNum.map(_._2), vectorDataWithRowNum.map(data=>(data._1, data._2)), vectorDataWithRowNum.map(data=>(data._1, data._3)))
// Need to Fix the Random Centers hardcoded 800
// NOTE(review): 800 has to equal the length of the assembled "features" vector.
// With the hashing width of 200 set in buildOneHotPipeLine that presumably means
// four string columns and no non-string columns — confirm against the arguments in use.
val model = new StreamingKMeans()
.setK(noOfNodes)
.setDecayFactor(1.0)
.setRandomCenters(800, 0)
// Train the Kmeans model
model.trainOn(dStreamTrainVector)
// Predict values from the model
// Predictions are keyed by row number and joined back to the original row of the same batch.
val joinData = model.predictOnValues(dStreamTestVector).join(rowData)
joinData.foreachRDD(rdd=> {
val seqData = rdd.map(x=>{
val id = x._1
val clusterID = x._2._1
// Drop the vector values so the remaining values line up with vectorsDFSchema.
// NOTE(review): only SparseVector instances are removed here, whereas the schema
// filter above removes every Vector-typed field; a dense vector value would leave
// the row and the schema misaligned — confirm vectors are always sparse.
val rowData = x._2._2.toSeq.filter(!_.isInstanceOf[org.apache.spark.mllib.linalg.SparseVector])
Row.fromSeq(rowData :+ clusterID.toString)
})
val dfPredictions = sqlContext.createDataFrame(seqData, vectorsDFSchema.add("clusterID", StringType))
dfPredictions.printSchema()
dfPredictions.show()
dfPredictions.write.mode(SaveMode.Append).save(predictedDataLocation)
})
// Everything above only declares the streaming graph; processing begins here.
ssc.start
ssc.awaitTermination
}
/**
 * Builds the two feature-encoding stages for a single string column.
 *
 * Streaming mode tokenizes the column and hashes the tokens into a vector of
 * fixed width (200), written to `<colName>_hashFeature`, so the vector length is
 * the same for every micro-batch.
 *
 * Batch mode indexes the column into `<colName>_index` and one-hot encodes that
 * index into `<colName>_onehotindex`.
 *
 * @param colName     name of the string column to encode
 * @param isStreamJob true to use Tokenizer + HashingTF, false to use
 *                    StringIndexer + OneHotEncoder
 * @return the two stages, in the order they must run
 */
private def buildOneHotPipeLine(colName:String, isStreamJob:Boolean = false):Array[PipelineStage] = {
  if (isStreamJob) {
    val tokenCol = s"${colName}_token"
    val tokenizer = new org.apache.spark.ml.feature.Tokenizer()
      .setInputCol(colName)
      .setOutputCol(tokenCol)
    val hashingTF = new org.apache.spark.ml.feature.HashingTF()
      .setInputCol(tokenCol)
      .setOutputCol(s"${colName}_hashFeature")
      .setNumFeatures(200)
    Array[PipelineStage](tokenizer, hashingTF)
  } else {
    val indexCol = s"${colName}_index"
    val stringIndexer = new StringIndexer()
      .setInputCol(colName)
      .setOutputCol(indexCol)
    // The encoder must consume the numeric index produced by the indexer, not the
    // raw string column: OneHotEncoder rejects non-numeric input columns.
    val oneHotEncoder = new OneHotEncoder()
      .setInputCol(indexCol)
      .setOutputCol(s"${colName}_onehotindex")
    Array[PipelineStage](stringIndexer, oneHotEncoder)
  }
}
/**
 * Assembles the data-preparation stages that turn raw columns into a single
 * "features" vector and, optionally, a "label" column.
 *
 * Stage order: the encoding stages of every string column, then the
 * VectorAssembler, then the label StringIndexer when a label column is given.
 *
 * @param sqlContext    unused; kept so existing callers keep compiling
 * @param lableCol      label column name; null or "" means no label stage
 * @param stringCols    comma-separated string feature columns; null or "" means none
 * @param nonStringCols comma-separated numeric feature columns, passed to the
 *                      assembler as they are; null or "" means none
 * @param isStreamJob   true to encode string columns by hashing, false to
 *                      index and one-hot encode them
 * @return the pipeline stages in execution order
 * @throws IllegalArgumentException when no feature column is supplied at all
 */
def buildDataPrepPipeLine(sqlContext: SQLContext, lableCol: String, stringCols: String, nonStringCols: String, isStreamJob:Boolean = false):Array[PipelineStage] = {
  // null and "" both mean "no columns"; callers pass either.
  def columnsOf(csv: String): Array[String] =
    Option(csv).filter(_.nonEmpty).map(_.split(",")).getOrElse(Array.empty[String])

  val stringColsArray = columnsOf(stringCols)
  val nonStringColsArray = columnsOf(nonStringCols)

  // flatMap copes with an empty column list, where a reduce would throw.
  val featureStages: Array[PipelineStage] =
    stringColsArray.flatMap(columnName => buildOneHotPipeLine(columnName, isStreamJob).toSeq)

  // Must match the output column names chosen by buildOneHotPipeLine.
  val encodedSuffix = if (isStreamJob) "_hashFeature" else "_onehotindex"
  val assemblerInputCols: Array[String] =
    stringColsArray.map(colName => s"${colName}${encodedSuffix}") ++ nonStringColsArray

  // Fail here with a clear message instead of handing the assembler a null column list.
  require(assemblerInputCols.nonEmpty,
    "At least one feature column is required (stringCols and nonStringCols are both empty)")

  // Combine all the features and make it a single Feature
  val assembler = new VectorAssembler()
    .setInputCols(assemblerInputCols)
    .setOutputCol("features")

  val labelIndexer: Option[StringIndexer] =
    Option(lableCol).filter(_.nonEmpty).map { labelName =>
      new StringIndexer().setInputCol(labelName).setOutputCol("label")
    }

  val stages: List[PipelineStage] = featureStages.toList ::: List[PipelineStage](assembler) ::: labelIndexer.toList
  stages.toArray
}
/**
 * Returns a copy of `df` with an extra non-nullable Long column holding a
 * running row number.
 *
 * @param df      the DataFrame to number
 * @param offset  value added to the zero-based index, so the default 1 starts numbering at 1
 * @param colName name of the added column
 * @param inFront true to add the column as the first column, false to add it as the last
 * @return a new DataFrame built on the SQLContext of `df`
 */
def dfWithRowIndexUsingRDD(df: DataFrame, offset: Int = 1, colName: String = "row_num", inFront: Boolean = true): DataFrame = {
  val indexField = StructField(colName, LongType, false)
  val numberedSchema =
    if (inFront) StructType(indexField +: df.schema.fields)
    else StructType(df.schema.fields :+ indexField)

  val numberedRows = df.rdd.zipWithIndex.map { case (row, position) =>
    val rowNumber = position + offset
    val values: Seq[Any] = if (inFront) rowNumber +: row.toSeq else row.toSeq :+ rowNumber
    Row.fromSeq(values)
  }

  df.sqlContext.createDataFrame(numberedRows, numberedSchema)
}
/**
 * Creates the SparkContext for the job.
 *
 * @param runLocal "local" or "l" (case-insensitive) starts a single-threaded local
 *                 context with broadcast, shuffle and shuffle-spill compression
 *                 switched off; any other value builds the context from a
 *                 SparkConf that only sets the application name
 * @param appName  application name
 * @return the context, with Parquet summary-metadata files disabled
 */
def getSparkContext(runLocal: String, appName: String) = {
  val isLocalRun = runLocal.equalsIgnoreCase("local") || runLocal.equalsIgnoreCase("l")
  val context: SparkContext =
    if (isLocalRun) {
      val localConf = new SparkConf()
        .set("spark.broadcast.compress", "false")
        .set("spark.shuffle.compress", "false")
        .set("spark.shuffle.spill.compress", "false")
      new SparkContext("local[1]", appName, localConf)
    } else {
      new SparkContext(new SparkConf().setAppName(appName))
    }
  context.hadoopConfiguration.setBoolean("parquet.enable.summary-metadata", false)
  context
}
/**
 * Builds a schema from a comma-separated list of column names.
 *
 * @param baseSchema column names separated by ","
 * @return a StructType with one nullable String field per name, in the given order
 */
def createSchemaFrmStrDelValues(baseSchema: String): StructType = {
  val fields = for (columnName <- baseSchema.split(",")) yield StructField(columnName, StringType, true)
  StructType(fields)
}
/**
 * Parses `key=value` command-line arguments into a map.
 *
 * Arguments without "=" are ignored. The key is the text before the first "=",
 * the value everything after it; an empty value is stored as null. When a key
 * occurs more than once the last occurrence wins. The raw argument list is
 * printed to standard output first.
 *
 * @param args raw command-line arguments
 * @return map from argument name to value (null for an empty value)
 */
def getNamedArgs(args: Array[String]): Map[String, String] = {
  println("################### Input parameters are ############### " + args.toList)
  val namedPairs = for {
    argument <- args
    if argument.contains("=")
  } yield {
    val separatorAt = argument.indexOf("=")
    val name = argument.take(separatorAt)
    val rawValue = argument.drop(separatorAt + 1)
    val storedValue: String = if (rawValue.isEmpty) null else rawValue
    name -> storedValue
  }
  namedPairs.toMap
}
/**
 * Sets the given level on every named log4j logger.
 *
 * @param level   level to apply
 * @param loggers logger names
 * @return map from logger name to the level that logger reported before the change
 */
def setLogLevels(level: Level, loggers: Seq[String]): Map[String, Level] = {
  val previousLevels = for (name <- loggers) yield {
    val target = Logger.getLogger(name)
    val levelBefore = target.getLevel
    target.setLevel(level)
    (name, levelBefore)
  }
  previousLevels.toMap
}
/**
 * Loads a CSV file into a DataFrame through the com.databricks.spark.csv source.
 *
 * @param sqlContext    context used for reading
 * @param isHeaderExist true when the first line holds the column names; the schema
 *                      is then inferred and `delimiter` / `csvSplit` are not used
 * @param filePath      location of the CSV file
 * @param delimiter     field delimiter, applied only to header-less files
 * @param csvSplit      comma-separated column names for header-less files; every
 *                      column is read as a String
 * @return the loaded DataFrame, or null when the file has no header and
 *         `csvSplit` is null
 * @throws Exception any failure while reading is logged to standard output and rethrown
 */
def pullDataFromCSVFile(sqlContext: SQLContext, isHeaderExist: Boolean, filePath: String, delimiter: String, csvSplit: String): DataFrame = {
  try {
    if (isHeaderExist) {
      sqlContext.read
        .format("com.databricks.spark.csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(filePath)
    } else if (csvSplit != null) {
      val columnSchema = createSchemaFrmStrDelValues(csvSplit)
      sqlContext.read
        .format("com.databricks.spark.csv")
        .option("header", "false")
        .option("delimiter", delimiter)
        .option("inferSchema", "false")
        .schema(columnSchema)
        .load(filePath)
    } else {
      // No header and no column list: nothing can be read.
      null
    }
  } catch {
    case ex: Exception =>
      println("Unable to read the CSV file from the location " + filePath)
      ex.printStackTrace()
      throw ex
  }
}
/**
 * Runs the batch K-Means pipeline: loads the training and test CSV files (both
 * with a header line), splits the requested feature columns into numeric and
 * string columns according to the schema of the test data, fits the preparation
 * pipeline plus K-Means on the training data and shows the predictions for the
 * test data.
 *
 * A model previously saved at `locationToSaveModelData` is reused when it can be
 * loaded; otherwise a new KMeans estimator is trained.
 *
 * @param args command-line arguments in `key=value` form; the keys read here are
 *             clusterORLocal (or localOrCluster, the name used by the streaming
 *             entry point), csvOrHiveForTrain, csvOrHiveForTest,
 *             locationToSaveModelData, labelCol, featureCols, noOfNodes and noOfIter
 * @throws Exception when none of the requested feature columns is a numeric or
 *                   string column of the test data
 */
def executeBatch(args: Array[String]): Unit = {
  val namedArgs = getNamedArgs(args)
  // Accept the key used by the streaming job as well, so one command line works for both modes.
  val runLocal = namedArgs.getOrElse("clusterORLocal", namedArgs("localOrCluster"))
  val sc: SparkContext = getSparkContext(runLocal, "Ml Pipeline")
  val csvOrHiveForTrain = namedArgs("csvOrHiveForTrain")
  val csvOrHiveForTest = namedArgs("csvOrHiveForTest")
  val locationToSaveModelData = namedArgs("locationToSaveModelData")
  val labelCol = namedArgs("labelCol")
  val featureCols = namedArgs("featureCols").split(",").map(_.toLowerCase)
  val noOfNodes = namedArgs("noOfNodes").toInt
  val noOfIter = namedArgs("noOfIter").toInt
  val sqlContext = new SQLContext(sc)

  val trainDF = pullDataFromCSVFile(sqlContext, true, csvOrHiveForTrain, null, null)
  trainDF.printSchema()
  val testDF = pullDataFromCSVFile(sqlContext, true, csvOrHiveForTest, null, null)
  testDF.printSchema()

  Logger.getRootLogger().setLevel(Level.ERROR)

  // Only the columns the caller asked for take part, matched case-insensitively.
  val requestedFields = testDF.schema.fields.filter(field => featureCols.contains(field.name.trim.toLowerCase))

  // DecimalType reports its type name with precision and scale attached, so it is
  // recognised by prefix; an exact comparison with "decimal" would never match.
  def isNumericType(typeName: String): Boolean =
    Set("integer", "long", "double", "float").contains(typeName) || typeName.startsWith("decimal")

  val nonStringCols = requestedFields
    .filter(field => isNumericType(field.dataType.typeName.trim.toLowerCase))
    .map(_.name.trim)
    .mkString(",")
  val stringCols = requestedFields
    .filter(_.dataType.typeName.trim.equalsIgnoreCase("string"))
    .map(_.name.trim)
    .mkString(",")

  if (nonStringCols.isEmpty && stringCols.isEmpty) {
    throw new Exception("Check if Feature columns are empty")
  }

  // Reuse a saved model when one can be loaded, otherwise train a fresh estimator.
  val clusteringStage: PipelineStage =
    Try[PipelineStage](KMeansModel.load(locationToSaveModelData))
      .getOrElse(new KMeans().setK(noOfNodes).setMaxIter(noOfIter))

  //Build pipeline
  val preparationStages = buildDataPrepPipeLine(sqlContext, labelCol, stringCols, nonStringCols)
  val pipeline = new Pipeline().setStages(preparationStages ++ Array[PipelineStage](clusteringStage))
  //Model generated and used to prepare data for model
  val pipelineModel = pipeline.fit(trainDF)
  //predict using model using the prepared data thru the pipeline
  val testPredictions = pipelineModel.transform(testDF)
  testPredictions.show()
}
}
导入com.common.Configuration_
导入com.twitter.bijection.Injection
导入com.twitter.bijection.avro.GenericAvroCodecs
导入kafka.serializer.{DefaultDecoder,StringDecoder}
导入org.apache.avro.Schema
导入org.apache.avro.generic.GenericRecord
导入org.apache.spark.mllib.clustering.StreamingKMeans
导入org.apache.spark.sql.{DataFrame,Row,SQLContext,SaveMode}
导入org.apache.spark.streaming.{Seconds,StreamingContext}
导入org.apache.spark.streaming.kafka.KafkaUtils
导入org.apache.log4j.{Level,Logger}
导入org.apache.spark.ml.feature.{HashingTF,Tokenizer}
导入org.apache.spark.ml.clustering.{KMeans,KMeansModel}
导入org.apache.spark.{SparkConf,SparkContext}
导入org.apache.spark.ml.feature.{OneHotEncoder、StringIndexer、VectorAssembler}
导入org.apache.spark.ml.{Pipeline,PipelineStage}
导入org.apache.spark.rdd.rdd
导入org.apache.spark.sql.types.{LongType,StringType,StructField,StructType}
导入scala.util.Try;
对象KafkaDataConsumer{
//localOrCluster=l streamBatchSeconds=2 locationToSaveStream=tmp3 isAvroTopic=text hiveTableName=modelName=streamkmeans hiveORFile=file locationToSaveModelData=tmp1 PredictedCatalLocation=tmp2 labelCol=salary nonStringCols=stringCols=workclass、age、education_num、hours_/周csvOrHiveForTrain=data/mllib/成人.csv csvOrHiveForTest=data/mllib/adult、 csv noOfNodes=10 noOfIter=10 locationToStorePmml=pmmlfolder runMLPipeline=true isoverwritedaaok=no
//设置日志级别
设置日志级别(Level.WARN,Seq(“spark”、“org”、“akka”))
def main(参数:数组[字符串]){
executeStreamdata(args)
}
def executeStreamdata(参数:数组[字符串]){
val namedArgs=getNamedArgs(args)
val localOrCluster=namedArgs(“localOrCluster”)
val streamBatchSeconds=namedArgs(“streamBatchSeconds”).toInt
val isAvroTopic=namedArgs(“isAvroTopic”)。等效信号案例(“avro”)
val kafkaBrokerList=kafkanconfig.getString(“kafkaBrokerList”)
瓦尔·卡夫卡通片=
如果(isAvroTopic)
Kafkanconfig.getString(“主题名称”)
其他的
Kafkanconfig.getString(“主题2_名称”)
val模式=
如果(isAvroTopic)
Kafkanconfig.getString(“主题1_模式”)
其他的
Kafkanconfig.getString(“主题2_模式”)
val labelCol=名称(“labelCol”)
val nonStringCols=namedArgs(“nonStringCols”)
val stringCols=名称代号(“stringCols”)
val noOfNodes=namedArgs(“noOfNodes”).toInt
val predictedDataLocation=namedArgs(“predictedDataLocation”)
val sc=getSparkContext(localOrCluster,“卡夫卡流应用程序”)
val ssc=新的StreamingContext(sc,秒(streamBatchSeconds))
val sqlContext=新的sqlContext(sc)
val topicsSet=kafkaTopicList.split(“,”).toSet
val kafkabarams=Map[String,String](“metadata.broker.list”->kafkaBrokerList)
//以AVRO或字符串形式读取流记录
val数据流记录=
如果(isAvroTopic){
val parser=newschema.parser
val schema=parser.parse(schema)
val recordInjection:Injection[GenericRecord,Array[Byte]]=GenericVRocodecs.toBinary(架构)
KafkaUtils.createDirectStream[String,Array[Byte],StringDecoder,DefaultDecoder](ssc,kafkaParams,TopicSet)
.map(message=>recordInjection.invert(message.\u 2.get.toString)
}
否则{
KafkaUtils.createDirectStream[String,String,StringDecoder,StringDecoder](ssc,kafkaParams,TopicSet).map(u.\u 2)
}
println(“执行流式KMeans”)
//基于输入标签列、字符串和非字符串列,使用StringTokenizer和HashFunction生成标签、特征向量的管道
val pipelineStagesWithAssembler=buildDataPrepPipeLine(sqlContext、labelCol、stringCols、NonSringCols、true)
val pipeline=new pipeline().setStages(pipelineStagesWithAssembler)