Java: how to convert JSON from Kafka to pass it to Spark's machine learning algorithm
I am trying to learn Spark and Spark Streaming using Java, and to develop an IoT application. I have a Kafka server that accepts JSON data, and I am able to parse it using SQLContext and a foreach function. The data is in the following format:
[{"t":1481368346000,"sensors":[{"s":"s1","d":"+149.625"},{"s":"s2","d":"+23.062"},{"s":"s3","d":"+16.375"},{"s":"s4","d":"+235.937"},{"s":"s5","d":"+271.437"},{"s":"s6","d":"+265.937"},{"s":"s7","d":"+295.562"},{"s":"s8","d":"+301.687"}]}]
Here, t is the timestamp of each stream of data, and sensors is an array of sensor readings, where s is the name of each sensor and d holds one reading.
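As a point of reference, a record of this shape can be described with an explicit Spark SQL schema instead of relying on inference; a minimal sketch (the types are my reading of the sample above, and d is kept as a string because the readings carry a leading sign):

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

// Explicit schema for the sample record: t is an epoch-millis long and
// sensors is an array of {s, d} structs; d stays a string ("+149.625").
StructType sensorSchema = new StructType()
        .add("s", DataTypes.StringType)
        .add("d", DataTypes.StringType);

StructType recordSchema = new StructType()
        .add("t", DataTypes.LongType)
        .add("sensors", DataTypes.createArrayType(sensorSchema));

// Usable as: sqlContext.read().schema(recordSchema).json(...)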
What I have done so far is:
JavaPairInputDStream<String, String> directKafkaStream =
        KafkaUtils.createDirectStream(ssc,
                String.class,
                String.class,
                StringDecoder.class,
                StringDecoder.class,
                kafkaParams,
                topics);

SQLContext sqlContext = spark.sqlContext();
StreamingLinearRegressionWithSGD model =
        new StreamingLinearRegressionWithSGD().setInitialWeights(Vectors.zeros(2));

JavaDStream<String> json = directKafkaStream.map(new Function<Tuple2<String, String>, String>() {
    public String call(Tuple2<String, String> message) throws Exception {
        return message._2();
    }
});

json.print();

json.foreachRDD(new VoidFunction<JavaRDD<String>>() {
    @Override
    public void call(JavaRDD<String> jsonRecord) throws Exception {
        System.out.println("JSON Record ---- " + jsonRecord);
        if (!jsonRecord.isEmpty()) {
            Dataset<Row> timestamp = sqlContext.read().json(jsonRecord).select("t");
            timestamp.printSchema();
            timestamp.show(false);

            Dataset<Row> data = sqlContext.read().json(jsonRecord).select("sensors");
            data.printSchema();
            data.show(false);

            // DF in table form
            Dataset<Row> df = data
                    .select(org.apache.spark.sql.functions.explode(org.apache.spark.sql.functions.col("sensors")))
                    .toDF("sensors")
                    .select("sensors.s", "sensors.d")
                    .where("sensors.s = 's1'");

            Row firstRow = df.head();
            String valueOfFirstSensor = firstRow.getString(1);
            System.out.println("---------valueOfFirstSensor --------" + valueOfFirstSensor);

            double[] values = new double[1];
            values[0] = firstRow.getDouble(0);
            new LabeledPoint(timestamp.head().getDouble(0), Vectors.dense(values));
            df.show(false);
        }
    }
});

ssc.start();
ssc.awaitTermination();
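One detail worth noting: d arrives as a string such as "+149.625", so turning a reading into a double for MLlib needs an explicit parse; calling getDouble on a string column will not convert it. A minimal sketch (parseReading is a hypothetical helper, not part of the code above):

// Hypothetical helper: Double.parseDouble accepts an optional leading '+',
// so readings like "+149.625" parse directly.
static double parseReading(String d) {
    return Double.parseDouble(d);
}

// e.g. values[0] = parseReading(firstRow.getString(1));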
What I am trying to do is convert json (a JavaDStream<String>) into a data structure that StreamingLinearRegressionWithSGD's trainOn accepts. When I try to map the json stream to a JavaDStream<LabeledPoint> using Spark's map function, like this:
[{"t":1481368346000,"sensors":[{"s":"s1","d":"+149.625"},{"s":"s2","d":"+23.062"},{"s":"s3","d":"+16.375"},{"s":"s4","d":"+235.937"},{"s":"s5","d":"+271.437"},{"s":"s6","d":"+265.937"},{"s":"s7","d":"+295.562"},{"s":"s8","d":"+301.687"}]}]
JavaDStream<LabeledPoint> forML = json.map(new Function<String, LabeledPoint>() {
    @Override
    public LabeledPoint call(String jsonRecord) throws Exception {
        System.out.println("\n\n\n here is JSON in " + jsonRecord);
        LabeledPoint returnObj = null;
        if (!jsonRecord.isEmpty()) {
            Dataset<Row> timestamp = sqlContext.read().json(jsonRecord).select("t");
            timestamp.printSchema();
            timestamp.show(false);

            Dataset<Row> data = sqlContext.read().json(jsonRecord).select("sensors");
            data.printSchema();
            data.show(false);

            // DF in table form
            Dataset<Row> df = data
                    .select(org.apache.spark.sql.functions.explode(org.apache.spark.sql.functions.col("sensors")))
                    .toDF("sensors")
                    .select("sensors.s", "sensors.d")
                    .where("sensors.s = 's1'");

            Row firstRow = df.head();
            String valueOfFirstSensor = firstRow.getString(1);
            System.out.println("---------valueOfFirstSensor --------" + valueOfFirstSensor);

            double[] values = new double[1];
            values[0] = firstRow.getDouble(0);
            returnObj = new LabeledPoint(timestamp.head().getDouble(0), Vectors.dense(values));
            df.show(false);
        }
        return returnObj;
    }
}).cache();

model.trainOn(forML);
and call model.trainOn, it fails at:
Dataset<Row> timestamp = sqlContext.read().json(jsonRecord).select("t");
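Looking at where it fails: sqlContext here is used inside the map closure, which runs on the executors where driver-side contexts are generally not usable, and that would explain a failure at exactly this line. A sketch of how the same record could be built into a LabeledPoint without SQLContext, assuming plain Jackson (which Spark already bundles); this is an illustration of the idea, not my original code:

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

// Sketch: parse one record with Jackson inside the closure, so no
// driver-side context is touched. Record shape as in the sample:
// [{"t":1481368346000,"sensors":[{"s":"s1","d":"+149.625"}, ...]}]
JavaDStream<LabeledPoint> forML = json.map(new Function<String, LabeledPoint>() {
    @Override
    public LabeledPoint call(String jsonRecord) throws Exception {
        JsonNode root = new ObjectMapper().readTree(jsonRecord).get(0); // outer array, first element
        double label = root.get("t").asDouble();
        double s1 = 0.0;
        for (JsonNode sensor : root.get("sensors")) {
            if ("s1".equals(sensor.get("s").asText())) {
                s1 = Double.parseDouble(sensor.get("d").asText()); // handles "+149.625"
            }
        }
        return new LabeledPoint(label, Vectors.dense(s1));
    }
});

In a real job the ObjectMapper would be reused per partition rather than created per record.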
Now the questions I am asking are