Apache Flink:如何过滤出流中大于某个阈值的值?
标签:apache-flink
我有两个流。第一个是基于时间的流,我使用 countTimeWindow 接收前 10 个数据点来计算一个统计值(stat)。我手动使用变量 cnt 仅保留第一个窗口的结果,并过滤掉其余窗口的结果,如下面的代码所示。
然后,我想用这个统计值来过滤主流,只保留大于我在窗口流中计算出的统计值的那些值。
然而,我不知道如何合并或联合处理这两个流来实现我的目标。
我设想的方案是:把第一个统计值转换为广播变量,然后将其提供给主流,这样就能根据广播变量中的统计值过滤传入的值。
下面是我的代码:
import com.sun.org.apache.xpath.internal.operations.Bool;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.GlobalWindow;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer09;
import org.apache.flink.streaming.util.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.functions.windowing.*;
import org.apache.flink.util.Collector;
import scala.Int;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.TimeUnit;
public class ReadFromKafka {
static int cnt = 0;
public static void main(String[] args) throws Exception{
// create execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
Properties properties = new Properties();
properties.setProperty("bootstrap.servers", "localhost:9092");
properties.setProperty("group.id", "flink");
DataStream<String> stream = env
.addSource(new FlinkKafkaConsumer09<>("flinkStreaming11", new SimpleStringSchema(), properties));
env.enableCheckpointing(1000);
//Time based window stream
DataStream<String> process = stream.countWindowAll(10).process(new ProcessAllWindowFunction<String, Tuple2<Double, Integer>, GlobalWindow>() {
@Override
public void process(Context context, Iterable<String> iterable, Collector<Tuple2<Double, Integer>> collector) throws Exception {
Double sum = 0.0;
int n = 0;
List<Double> listDouble = new ArrayList<>();
for (String in : iterable) {
n++;
double d = Double.parseDouble(in);
sum += d;
listDouble.add(d);
}
cnt++;
Double[] sd = listDouble.toArray(new Double[listDouble.size()]);
double mean = sum / n;
double sdev = 0;
for (int i = 0; i < sd.length; ++i) {
sdev += ((sd[i] - mean) * (sd[i] - mean)) / (sd.length - 1);
}
double standardDeviation = Math.sqrt(sdev);
collector.collect(new Tuple2<Double, Integer>(mean + 3 * standardDeviation, cnt));
}
}).filter(new FilterFunction<Tuple2<Double, Integer>>() {
@Override
public boolean filter(Tuple2<Double, Integer> doubleIntegerTuple2) throws Exception {
Integer i1 = doubleIntegerTuple2.f1;
if (i1 > 1)
return false;
else
return true;
}
}).map(new RichMapFunction<Tuple2<Double, Integer>, String>() {
@Override
public String map(Tuple2<Double, Integer> doubleIntegerTuple2) throws Exception {
return String.valueOf(doubleIntegerTuple2.f0);
}
});
//I don't think that this is not a proper solution.
process.union(stream).filter(new FilterFunction<String>() {
@Override
public boolean filter(String s) throws Exception {
return false;
}
})
env.execute("InfluxDB Sink Example");
env.execute();
}
}
import com.sun.org.apache.xpath.internal.operations.Bool;
导入org.apache.flink.api.common.functions.FilterFunction;
导入org.apache.flink.api.common.functions.MapFunction;
导入org.apache.flink.api.common.functions.RichMapFunction;
导入org.apache.flink.api.java.tuple.Tuple2;
导入org.apache.flink.streaming.api.TimeCharacteristic;
导入org.apache.flink.streaming.api.datastream.datastream;
导入org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
导入org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
导入org.apache.flink.streaming.api.windowing.time.time;
导入org.apache.flink.streaming.api.windowing.windows.GlobalWindow;
导入org.apache.flink.streaming.api.windowing.windows.TimeWindow;
导入org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer09;
导入org.apache.flink.streaming.util.serialization.SimpleStringSchema;
导入org.apache.flink.streaming.api.functions.windowing.*;
导入org.apache.flink.util.Collector;
导入scala.Int;
导入java.text.simpleDataFormat;
导入java.util.*;
导入java.util.concurrent.TimeUnit;
公共类ReadFromKafka{
静态int cnt=0;
公共静态void main(字符串[]args)引发异常{
//创建执行环境
StreamExecutionEnvironment env=StreamExecutionEnvironment.getExecutionEnvironment();
属性=新属性();
setProperty(“bootstrap.servers”,“localhost:9092”);
properties.setProperty(“group.id”、“flink”);
数据流=环境
.addSource(新的FlinkKafkaConsumer09(“flinkStreaming11”,新的SimpleStringSchema(),属性));
环境启用检查点(1000);
//基于时间的窗口流
DataStream process=stream.countWindowAll(10.process)(新的ProcessAllWindowFunction(){
@凌驾
公共void进程(上下文上下文、Iterable、收集器)引发异常{
双和=0.0;
int n=0;
List listDouble=新的ArrayList();
for(字符串输入:iterable){
n++;
double d=double.parseDouble(in);
总和+=d;
添加(d);
}
cnt++;
Double[]sd=listDouble.toArray(新的Double[listDouble.size()]);
双平均值=总和/n;
双sdev=0;
对于(int i=0;i1)
返回false;
其他的
返回true;
}
}).map(新的RichMapFunction(){
@凌驾
公共字符串映射(Tuple2 doubleIntegerTuple2)引发异常{
返回字符串.valueOf(doubleIntegerTuple2.f0);
}
});
//我认为这不是一个合适的解决方案。
process.union(stream.filter)(新的FilterFunction(){
@凌驾
公共布尔筛选器(字符串s)引发异常{
返回false;
}
})
环境执行(“XDB接收器示例”);
execute();
}
}
首先,我认为您实际上只有一个流,对吗?也就是只有一个基于 Kafka 的数据源,其中的 double 值被编码为字符串。
其次,如果前 10 个值确实永久决定了过滤阈值,那么您可以把这个流送入一个 RichFlatMapFunction:在该函数中先捕获前 10 个值来计算阈值,然后过滤所有后续的值(仅输出 >= 该阈值的值)。
请注意,通常您希望保存状态(10个初始值的数组,加上限制),以便可以从检查点/保存点重新启动工作流
相反,如果您需要不断地根据最近的 10 个值重新计算阈值,那么代码会稍微复杂一些:您需要维护一个值队列,并在每次添加新值时,对从队列中被挤出的那个值进行过滤。