Apache spark 时间序列模型在spark流数据中的应用
在我的应用程序中，我需要从 Kafka 接收流式数据，并且需要对 Spark 中接收到的流式数据应用时间序列（time series）模型。我能够从 Kafka 读取流式数据，但我不知道如何将时间序列模型应用于流式数据。请问有谁可以介绍一下时间序列模型的工作原理和使用场景？数据集示例：
725030:14732,2008,01,01,00,5.0,-3.9,1020.4,270,4.6,2,0.0,0.0
725030:14732,2008,01,01,01,5.0,-3.3,1020.6,290,4.1,2,0.0,0.0
725030:14732,2008,01,01,02,5.0,-3.3,1020.0,310,3.1,2,0.0,0.0
725030:14732,2008,01,01,03,4.4,-2.8,1020.1,300,1.5,2,0.0,0.0
725030:14732,2008,01,01,04,3.3,-4.4,1020.5,240,2.6,0,0.0,0.0
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import kafka.serializer.StringDecoder;
import scala.Tuple2;
//KafkatoSparkStreaming Working Code
/*this code is converting KafkaToSparkstreaming into DataSet by useing SparkJava
* */
public class a {
    /**
     * Entry point. Consumes raw weather CSV lines from the Kafka topic
     * "ny-2008.csv" with a direct (receiver-less) stream and, for every
     * 2-second micro-batch, wraps the messages in a one-column DataFrame,
     * shows it and registers it as the temp view "weatherTemporaryData".
     *
     * @param arr command-line arguments (unused)
     * @throws InterruptedException if awaitTermination is interrupted
     */
    public static void main(String arr[]) throws InterruptedException
    {
        SparkConf conf = new SparkConf();
        conf.set("spark.app.name", "SparkReceiver"); // Application name; appears in the UI and in log data.
        // BUG FIX: the original key "dynamicAllocation.enabled" lacked the
        // "spark." prefix, so Spark silently ignored the setting.
        conf.set("spark.dynamicAllocation.enabled", "false"); // Do not scale executors with the workload.
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); // Kryo for objects sent over the network or cached in serialized form.
        conf.setMaster("local");
        conf.set("spark.streaming.stopGracefullyOnShutdown", "true"); // Finish in-flight batches on shutdown.
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Create the streaming context with a 2-second batch interval.
        JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(2000));
        // Old (Kafka 0.8-style) consumer/producer settings.
        Map<String, String> kafkaParams = new HashMap<String, String>();
        kafkaParams.put("zookeeper.connect", "localhost:2181"); // ZooKeeper ensemble holding the Kafka cluster metadata.
        kafkaParams.put("group.id", "testgroup"); // Consumer-group id this consumer belongs to.
        kafkaParams.put("metadata.broker.list", "localhost:9092"); // Bootstrap broker(s) used to find the leader for each topic.
        kafkaParams.put("serializer.class", "kafka.serializer.StringEncoder"); // Serializer used when sending messages to the broker.
        kafkaParams.put("request.required.acks", "1"); // Require a broker acknowledgement per message.
        Set<String> topics = Collections.singleton("ny-2008.csv"); // Single topic carrying the CSV rows.
        // Create a direct input DStream over the Kafka topic.
        JavaPairInputDStream<String, String> directKafkaStream = KafkaUtils.createDirectStream(ssc,
                String.class,
                String.class,
                StringDecoder.class,
                StringDecoder.class,
                kafkaParams, topics);
        // Keep only the message value; the Kafka key is not used.
        JavaDStream<String> msgDataStream = directKafkaStream.map(new Function<Tuple2<String, String>, String>() {
            @Override
            public String call(Tuple2<String, String> tuple2) {
                return tuple2._2();
            }
        });
        // For each micro-batch, turn the RDD of lines into a DataFrame.
        msgDataStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {
            @Override
            public void call(JavaRDD<String> rdd) {
                JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
                    @Override
                    public Row call(String msg) {
                        // One CSV line per Row, stored as a single string column.
                        return RowFactory.create(msg);
                    }
                });
                // Schema: a single nullable string column named "Message".
                StructType schema = DataTypes.createStructType(new StructField[] {DataTypes.createStructField("Message", DataTypes.StringType, true)});
                // Reuse one SparkSession across batches (Spark 2.0 API).
                SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
                Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
                msgDataFrame.show();
                msgDataFrame.createOrReplaceTempView("weatherTemporaryData");
                // BUG FIX: the original selected column "280", which does not
                // exist in the schema and throws an AnalysisException at
                // runtime; "Message" is the only column.
                msgDataFrame.select("Message").show();
            }
        });
        ssc.start();
        ssc.awaitTermination();
    }
}
/**
 * Holder for a single process-wide {@link SparkSession}, created lazily
 * from the first {@link SparkConf} passed in.
 */
class JavaSparkSessionSingleton {
    // Lazily-created shared session, guarded by the class lock.
    // FIX: dropped the original "transient" modifier — it is meaningless on
    // a static field (transient affects only instance serialization).
    private static SparkSession instance = null;

    private JavaSparkSessionSingleton() {} // non-instantiable holder class

    /**
     * Returns the shared SparkSession, creating it on first use.
     * Synchronized so concurrent first calls cannot race the null check.
     *
     * @param sparkConf configuration applied when the session is first built
     * @return the process-wide SparkSession
     */
    public static synchronized SparkSession getInstance(SparkConf sparkConf) {
        if (instance == null) {
            instance = SparkSession
                    .builder()
                    .config(sparkConf)
                    .getOrCreate();
        }
        return instance;
    }
}
Sparkjava代码如下所示:
725030:14732,2008,01,01,00,5.0,-3.9,1020.4,270,4.6,2,0.0,0.0
725030:14732,2008,01,01,01,5.0,-3.3,1020.6,290,4.1,2,0.0,0.0
725030:14732,2008,01,01,02,5.0,-3.3,1020.0,310,3.1,2,0.0,0.0
725030:14732,2008,01,01,03,4.4,-2.8,1020.1,300,1.5,2,0.0,0.0
725030:14732,2008,01,01,04,3.3,-4.4,1020.5,240,2.6,0,0.0,0.0
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import kafka.serializer.StringDecoder;
import scala.Tuple2;
//KafkatoSparkStreaming Working Code
/*this code is converting KafkaToSparkstreaming into DataSet by useing SparkJava
* */
public class a {
    /**
     * Entry point. Consumes raw weather CSV lines from the Kafka topic
     * "ny-2008.csv" with a direct (receiver-less) stream and, for every
     * 2-second micro-batch, wraps the messages in a one-column DataFrame,
     * shows it and registers it as the temp view "weatherTemporaryData".
     *
     * @param arr command-line arguments (unused)
     * @throws InterruptedException if awaitTermination is interrupted
     */
    public static void main(String arr[]) throws InterruptedException
    {
        SparkConf conf = new SparkConf();
        conf.set("spark.app.name", "SparkReceiver"); // Application name; appears in the UI and in log data.
        // BUG FIX: the original key "dynamicAllocation.enabled" lacked the
        // "spark." prefix, so Spark silently ignored the setting.
        conf.set("spark.dynamicAllocation.enabled", "false"); // Do not scale executors with the workload.
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); // Kryo for objects sent over the network or cached in serialized form.
        conf.setMaster("local");
        conf.set("spark.streaming.stopGracefullyOnShutdown", "true"); // Finish in-flight batches on shutdown.
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Create the streaming context with a 2-second batch interval.
        JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(2000));
        // Old (Kafka 0.8-style) consumer/producer settings.
        Map<String, String> kafkaParams = new HashMap<String, String>();
        kafkaParams.put("zookeeper.connect", "localhost:2181"); // ZooKeeper ensemble holding the Kafka cluster metadata.
        kafkaParams.put("group.id", "testgroup"); // Consumer-group id this consumer belongs to.
        kafkaParams.put("metadata.broker.list", "localhost:9092"); // Bootstrap broker(s) used to find the leader for each topic.
        kafkaParams.put("serializer.class", "kafka.serializer.StringEncoder"); // Serializer used when sending messages to the broker.
        kafkaParams.put("request.required.acks", "1"); // Require a broker acknowledgement per message.
        Set<String> topics = Collections.singleton("ny-2008.csv"); // Single topic carrying the CSV rows.
        // Create a direct input DStream over the Kafka topic.
        JavaPairInputDStream<String, String> directKafkaStream = KafkaUtils.createDirectStream(ssc,
                String.class,
                String.class,
                StringDecoder.class,
                StringDecoder.class,
                kafkaParams, topics);
        // Keep only the message value; the Kafka key is not used.
        JavaDStream<String> msgDataStream = directKafkaStream.map(new Function<Tuple2<String, String>, String>() {
            @Override
            public String call(Tuple2<String, String> tuple2) {
                return tuple2._2();
            }
        });
        // For each micro-batch, turn the RDD of lines into a DataFrame.
        msgDataStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {
            @Override
            public void call(JavaRDD<String> rdd) {
                JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
                    @Override
                    public Row call(String msg) {
                        // One CSV line per Row, stored as a single string column.
                        return RowFactory.create(msg);
                    }
                });
                // Schema: a single nullable string column named "Message".
                StructType schema = DataTypes.createStructType(new StructField[] {DataTypes.createStructField("Message", DataTypes.StringType, true)});
                // Reuse one SparkSession across batches (Spark 2.0 API).
                SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
                Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
                msgDataFrame.show();
                msgDataFrame.createOrReplaceTempView("weatherTemporaryData");
                // BUG FIX: the original selected column "280", which does not
                // exist in the schema and throws an AnalysisException at
                // runtime; "Message" is the only column.
                msgDataFrame.select("Message").show();
            }
        });
        ssc.start();
        ssc.awaitTermination();
    }
}
/**
 * Holder for a single process-wide {@link SparkSession}, created lazily
 * from the first {@link SparkConf} passed in.
 */
class JavaSparkSessionSingleton {
    // Lazily-created shared session, guarded by the class lock.
    // FIX: dropped the original "transient" modifier — it is meaningless on
    // a static field (transient affects only instance serialization).
    private static SparkSession instance = null;

    private JavaSparkSessionSingleton() {} // non-instantiable holder class

    /**
     * Returns the shared SparkSession, creating it on first use.
     * Synchronized so concurrent first calls cannot race the null check.
     *
     * @param sparkConf configuration applied when the session is first built
     * @return the process-wide SparkSession
     */
    public static synchronized SparkSession getInstance(SparkConf sparkConf) {
        if (instance == null) {
            instance = SparkSession
                    .builder()
                    .config(sparkConf)
                    .getOrCreate();
        }
        return instance;
    }
}
import java.util.Collections;
导入java.util.HashMap;
导入java.util.Map;
导入java.util.Set;
导入org.apache.spark.SparkConf;
导入org.apache.spark.api.java.JavaRDD;
导入org.apache.spark.api.java.JavaSparkContext;
导入org.apache.spark.api.java.function.Function;
导入org.apache.spark.api.java.function.VoidFunction;
导入org.apache.spark.sql.Dataset;
导入org.apache.spark.sql.Row;
导入org.apache.spark.sql.RowFactory;
导入org.apache.spark.sql.SparkSession;
导入org.apache.spark.sql.types.DataTypes;
导入org.apache.spark.sql.types.StructField;
导入org.apache.spark.sql.types.StructType;
导入org.apache.spark.streaming.Duration;
导入org.apache.spark.streaming.api.java.JavaDStream;
导入org.apache.spark.streaming.api.java.JavaPairInputDStream;
导入org.apache.spark.streaming.api.java.JavaStreamingContext;
导入org.apache.spark.streaming.kafka.KafkaUtils;
导入kafka.serializer.StringDecoder;
导入scala.Tuple2;
//KafkatoSparkStreaming工作代码
/*这段代码使用SparkJava将KafkaToSparkstreaming转换为DataSet
* */
公共a类{
公共静态void main(字符串arr[])引发InterruptedException
{
SparkConf conf=新的SparkConf();
conf.set(“spark.app.name”,“SparkReceiver”);//应用程序的名称。它将出现在UI和日志数据中。
//conf.set(“spark.ui.port”,“7077”);//应用程序仪表板的端口,显示内存和工作负载数据。
conf.set(“dynamicAllocation.enabled”、“false”);//它根据工作负载上下缩放在此应用程序中注册的执行器的数量
//conf.set(“spark.cassandra.connection.host”、“localhost”);//cassandra主机地址/IP
conf.set(“spark.serializer”、“org.apache.spark.serializer.KryoSerializer”);//用于序列化将通过网络发送或需要以序列化形式缓存的对象。
conf.setMaster(“本地”);
conf.set(“spark.streaming.stopGracefullyOnShutdown”、“true”);
JavaSparkContext sc=新的JavaSparkContext(conf);
//创建具有2秒批大小的上下文
JavaStreamingContext ssc=新的JavaStreamingContext(sc,新的持续时间(2000));
Map kafkaParams=新HashMap();
kafkaParams.put(“zookeeper.connect”,“localhost:2181”);//使此集群的所有kafka数据显示在特定路径下。
kafkaParams.put(“group.id”,“testgroup”);//唯一标识此使用者所属的使用者进程组的字符串
kafkaParams.put(“metadata.broker.list”,“localhost:9092”);//生产者可以找到一个或多个代理来确定每个主题的领导者。
kafkaParams.put(“serializer.class”、“kafka.serializer.StringEncoder”);//准备消息传输到代理时要使用的序列化程序。
kafkaParams.put(“request.required.acks”,“1”);//生产者要求代理确认已收到消息。
设置主题=Collections.singleton(“ny-2008.csv”);
//创建用于从套接字接收数据的输入数据流
JavaPairInputDStream directKafkaStream=KafkaUtils.createDirectStream(ssc,
String.class,
String.class,
StringDecoder.class,
StringDecoder.class,
kafkaParams, topics);
//创建JavaDStream
JavaDStream msgDataStream=directKafkaStream.map(新函数(){
@凌驾
公共字符串调用(Tuple2 Tuple2){
返回tuple2._2();
}
});
//创建JavaRDD
msgDataStream.foreachRDD(新的VoidFunction(){
@凌驾
公共无效调用(JavaRDD rdd){
JavaRDD rowRDD=rdd.map(新函数(){
@凌驾
公用行调用(字符串消息){
Row=RowFactory.create(msg);
返回行;
}
});
//创建模式
StructType schema=DataTypes.createStructType(新StructField[]{DataTypes.createStructField(“Message”,DataTypes.StringType,true)});
//获取Spark 2.0会话
SparkSessionSpark=JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
数据集msgDataFrame=spark.createDataFrame(rowRDD,schema);
msgDataFrame.show();
msgDataFrame.createOrReplaceTempView(“天气临时数据”);
msgDataFrame.select(“280”).show();
}
});
ssc.start();
ssc.awaitTermination();
}
}
类JavaSparkSessionSingleton{
私有静态瞬态SparkSession实例=null;
公共静态SparkSession getInstance(SparkConf SparkConf){
if(实例==null){
实例=SparkSession
.builder()
.config(sparkConf)
.getOrCreate();
}
返回实例;
}
}
从统计学的角度来看,timeseries模型使用时移窗口。例如,您将尝试从N个过去值中预测未来值。如果这就是您想要实现的,那么您应该看看窗口函数。Jacek在这方面写了一篇好文章:
简而言之,为了帮助您了解发生了什么,您必须创建一个WindowSpec实例来指定: