Rabbitmq 为什么这个ProcessWindowFunction总是计算每个传入元素,而不是一个窗口的所有元素?
我正在尝试构建一个Apache Flink流处理作业,用来对非常简单的物联网数据进行计算。 它从RabbitMQ读取数据(作为源),因此使用了RMQSource。这部分工作正常,数据的解析也没有问题。 然而,对于解析后的数据流(类型为Triplet<String, Double, Long>,即(传感器ID,PM2.5值,时间戳)),之后应用的函数表现得很奇怪。Rabbitmq 为什么这个ProcessWindowFunction总是对每个传入元素单独计算,而不是对一个窗口内的所有元素计算?,rabbitmq,apache-flink,flink-streaming,jmeter-5.0,flink-cep,Rabbitmq,Apache Flink,Flink Streaming,Jmeter 5.0,Flink Cep,我正在尝试构建一个Apache Flink流处理作业,用来对非常简单的物联网数据进行计算。 它从RabbitMQ读取数据(作为源),因此使用了RMQSource。这部分工作正常,数据的解析也没有问题。 然而,对于解析后的数据流(类型为Triplet<String, Double, Long>,即(传感器ID,PM2.5值,时间戳)),之后应用的函数表现得很奇怪。 首先,我想按传感器ID对流进行分组(keyBy)。 其次,我想创建一个窗口,包含每10秒或15秒内按ID分组的所有元素。 第三,应该在此窗口上执行一个非常
首先,我想按传感器ID对流进行分组(keyBy)。
其次,我想创建一个窗口,包含每10秒或15秒内按ID分组的所有元素。
第三,应该在此窗口上执行一个非常基本的ProcessWindowFunction,它只统计该窗口中的元素个数。=> 基本上与(原文所链接的)示例相同。
最后,ProcessWindowFunction的输出应打印到标准输出(stdout)。 您可以在下面看到相关的代码部分。 我使用JMeter配合MQTT和KafkaMeter插件来发送测试数据,一次发送大约50个请求,然后观察会发生什么。 当我发送10个请求时,结果如下所示:
nope :(
nope :(
nope :(
nope :(
nope :(
nope :(
nope :(
nope :(
nope :(
nope :(
按我的理解,这意味着ProcessWindowFunction对每个元素都被调用了一次,而不是对每个窗口只调用一次。
我现在的问题是:
.window(TumblingProcessingTimeWindows.of(Time.seconds(10)))
并配合
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime); 已启用 -> 希望这样能行,但并没有
// Windowed pipeline: key the parsed triplets, collect each key's elements into
// 10-second tumbling processing-time windows, run the window function and print its output.
extractedDataStream
.keyBy(t -> t.getValue0()) // keyed by sensor IDs (first element of the triplet)
//.timeWindow(Time.seconds(10))
// Because the stream is keyed, the window function below is invoked per key and window,
// so a sensor that contributes a single message to a window is counted on its own.
.window(TumblingProcessingTimeWindows.of(Time.seconds(10)))
.process(new DetectTooHighAirPollution())
.print();
// execute program
env.execute("MQTT Detection StreamingJob");
}
/**
 * Window function that counts the elements it is handed for one key and window.
 *
 * <p>Emits {@code "yap :D!: <count>"} when more than one element was present and
 * {@code "nope :("} otherwise.
 */
public static class DetectTooHighAirPollution
        extends ProcessWindowFunction<Triplet<String, Double, Long>, String, String, TimeWindow> {

    /**
     * Counts the supplied elements and emits exactly one result string.
     *
     * @param key     key of the elements being evaluated (not used)
     * @param context window context (not used)
     * @param input   elements to count
     * @param out     collector receiving the single result message
     */
    @Override
    public void process(String key, Context context, Iterable<Triplet<String, Double, Long>> input, Collector<String> out) throws IOException {
        long elementCount = 0;
        for (Triplet<String, Double, Long> ignored : input) {
            elementCount += 1;
        }

        final boolean moreThanOne = elementCount > 1;
        final String message = moreThanOne ? "yap :D!: " + elementCount : "nope :(";
        out.collect(message);
    }
}
}
}
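extractedDataStream
.keyBy(t -> t.getValue0()) // 按传感器ID分组
//.timeWindow(Time.seconds(10))
.window(TumblingProcessingTimeWindows.of(Time.seconds(10)))
.process(new DetectTooHighAirPollution())
.print();
// 执行程序
env.execute("MQTT Detection StreamingJob");
}
public static class DetectTooHighAirPollution
extends ProcessWindowFunction<Triplet<String, Double, Long>, String, String, TimeWindow> {
@Override
public void process(String key, Context context, Iterable<Triplet<String, Double, Long>> input, Collector<String> out) throws IOException {
long count = 0;
for (Triplet<String, Double, Long> i : input) {
count++;
}
if (count > 1) {
out.collect("yap :D!: " + count);
} else {
out.collect("nope :(");
}
}
}
}
}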
为完整起见,下面给出代码的其余部分,这部分工作正常:
PS:我发送的MQTT消息的有效负载是JSON对象,目前我是“手动”解析它的。
PPS:配置详细信息已被删除。
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.streaming.connectors.rabbitmq.RMQSource;
import org.apache.flink.streaming.connectors.rabbitmq.common.RMQConnectionConfig;
import org.apache.flink.util.Collector;
import org.javatuples.Triplet;
import java.io.IOException;
public class StreamingJob {
public static void main(String[] args) throws Exception {
// set up the streaming execution environment
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);
//env.setParallelism(1);
// Set up a configuration for the RabbitMQ Source
final RMQConnectionConfig connectionConfig = new RMQConnectionConfig.Builder()
.setHost("")
.setPort()
.setUserName("")
.setPassword("")
.setVirtualHost("")
.build();
// Initiating a Data Stream from RabbitMQ
final DataStream<String> RMQstream = env
.addSource(new RMQSource<String>(
connectionConfig, // config for the RabbitMQ connection
"", // name of the RabbitMQ queue to consume
false, // use correlation ids; can be false if only at-least-once is required
new SimpleStringSchema())) // deserialization schema to turn messages into Java objects
.setParallelism(1); // parallel Source
//Extraction of values of the Data Stream
// Turn every raw message into a (sensor id, particulate-matter reading, timestamp) triplet.
final DataStream<Triplet<String, Double, Long>> extractedDataStream = RMQstream.map(
        new RichMapFunction<String, Triplet<String, Double, Long>>() {

            /** Returns the second ':'-separated piece of the comma-separated field at {@code index}. */
            private String rawValue(String[] fields, int index) {
                return fields[index].split(":")[1];
            }

            /**
             * Parses one message payload by splitting it on commas and colons.
             *
             * @param s the raw message text
             * @return triplet of sensor id, parsed reading and the current system time in milliseconds
             */
            @Override
            public Triplet<String, Double, Long> map(String s) throws Exception {
                final String[] fields = s.split(",");

                // Field 1 carries the sensor ID.
                final String id = rawValue(fields, 1).replaceAll(" ", "");

                // Fields 2 and 3 carry longitude and latitude; they are extracted but not emitted.
                final String longitude = rawValue(fields, 2).replaceAll(" ", "");
                final String latitude = rawValue(fields, 3).replaceAll(" ", "");

                // Field 6 carries the particulate matter; spaces and closing braces are stripped before parsing.
                final double pm2 = Double.parseDouble(rawValue(fields, 6).replaceAll("[ }]+", ""));

                // The timestamp is taken when the message is mapped, not read from the message.
                final long ts = System.currentTimeMillis();
                return Triplet.with(id, pm2, ts);
            }
        }
);
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.streaming.connectors.rabbitmq.RMQSource;
import org.apache.flink.streaming.connectors.rabbitmq.common.RMQConnectionConfig;
import org.apache.flink.util.Collector;
import org.javatuples.Triplet;
import java.io.IOException;
public class StreamingJob {
public static void main(String[] args) throws Exception {
// 设置流执行环境
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);
//env.setParallelism(1);
// 设置RabbitMQ源的配置
final RMQConnectionConfig connectionConfig = new RMQConnectionConfig.Builder()
.setHost("")
.setPort()
.setUserName("")
.setPassword("")
.setVirtualHost("")
.build();
// 从RabbitMQ创建数据流
final DataStream<String> RMQstream = env
.addSource(new RMQSource<String>(
connectionConfig, // RabbitMQ连接的配置
"", // 要消费的RabbitMQ队列的名称
false, // 是否使用关联ID;如果只需要至少一次(at-least-once)语义,可以为false
new SimpleStringSchema())) // 将消息转换为Java对象的反序列化模式
.setParallelism(1); // 源的并行度
// 提取数据流中的值
final DataStream<Triplet<String, Double, Long>> extractedDataStream = RMQstream.map(
new RichMapFunction<String, Triplet<String, Double, Long>>() {
@Override
public Triplet<String, Double, Long> map(String s) throws Exception {
// 提取消息的有效负载
String[] input = s.split(",");
// 提取传感器ID
String sensorID = input[1];
String unformattedID = sensorID.split(":")[1];
...
// Non-keyed variant: there is no keyBy, so a single 15-second tumbling processing-time
// window collects the elements of all sensors before the window function runs.
extractedDataStream
//.filter(t -> t.getValue1() > 30) //This is just a use-case specific => 71/100 sensor requests have a higher value than 30.
.windowAll(TumblingProcessingTimeWindows.of(Time.seconds(15)))
.process(new DetectTooHighAirPollution())
.print();
...
/**
 * All-window function that counts every element of a non-keyed window.
 *
 * <p>Emits {@code "<count> Sensors, report a too high concentration of PM2!"} when the
 * window holds at least ten elements and {@code "Upps something went wrong :/"} otherwise.
 */
public static class DetectTooHighAirPollution
        extends ProcessAllWindowFunction<Triplet<String, Double, Long>, String, TimeWindow> {

    /**
     * Counts the supplied elements and emits exactly one result string.
     *
     * @param context window context (not used)
     * @param input   elements to count
     * @param out     collector receiving the single result message
     */
    @Override
    public void process(Context context, Iterable<Triplet<String, Double, Long>> input, Collector<String> out) throws IOException {
        long readings = 0;
        for (Triplet<String, Double, Long> ignored : input) {
            readings += 1;
        }

        // Guard clause: fewer than ten readings is reported as the failure case.
        if (readings < 10) {
            out.collect("Upps something went wrong :/");
            return;
        }
        out.collect(readings + " Sensors, report a too high concentration of PM2!");
    }
}