Apache Flink:检查点大小超过 20GB,检查点耗时超过 1 分钟
首先,也是最重要的:Apache Flink 检查点的大小超过 20GB,检查点耗时超过 1 分钟,apache-flink,amazon-kinesis,checkpoint,amazon-kinesis-analytics,Apache Flink,Amazon Kinesis,Checkpoint,Amazon Kinesis Analytics,首先,也是最重要的: 我对 Flink 还比较陌生(了解其原理,能够创建我需要的任何基本流处理作业) 我使用 Kinesis Analytics 来运行我的 Flink 作业,默认情况下,它使用间隔为 1 分钟的增量检查点 Flink 作业使用 FlinkKinesisConsumer 和自定义反序列化器从 Kinesis 流读取事件(将字节反序列化为一个简单的 Java 对象,该对象在整个作业中使用) 我想实现的只是统计过去 24 小时内每个 ENTITY_ID/FOO 和 ENTITY_ID/BAR 的事件数量。重要的是,这个计数要尽可能准确,这就是为什
- 我对 Flink 还比较陌生(了解其原理,能够创建我需要的任何基本流处理作业)
- 我使用 Kinesis Analytics 来运行我的 Flink 作业,默认情况下,它使用间隔为 1 分钟的增量检查点
- Flink 作业使用 FlinkKinesisConsumer 和自定义反序列化器从 Kinesis 流读取事件(将字节反序列化为一个简单的 Java 对象,该对象在整个作业中使用)
SourceFunction源=
新的FlinkKinesisConsumer(“输入运动流”,新的MyJsonDeserializationSchema(),运动消费配置);
输入事件是一个简单的 Java POJO,将在 Flink 算子中使用:
public class Event implements Serializable {
public String entityId;
public String entityType;
public String entityName;
public long eventTimestamp = System.currentTimeMillis();
}
// Run the job on event time so windows are driven by the timestamp carried in each record.
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

// Take the timestamp from Event.eventTimestamp; the extractor is configured with a
// 30-second bound on out-of-orderness.
DataStream<Event> eventsStream = kinesis
        .assignTimestampsAndWatermarks(
                new BoundedOutOfOrdernessTimestampExtractor<Event>(Time.seconds(30)) {
                    @Override
                    public long extractTimestamp(Event event) {
                        return event.eventTimestamp;
                    }
                });
DataStream<Event> fooStream = eventsStream
.filter(new FilterFunction<Event>() {
@Override
public boolean filter(Event event) throws Exception {
return "foo".equalsIgnoreCase(event.entityType);
}
})
DataStream<Event> barStream = eventsStream
.filter(new FilterFunction<Event>() {
@Override
public boolean filter(Event event) throws Exception {
return "bar".equalsIgnoreCase(event.entityType);
}
})
StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);

// fromDataStream takes the DataStream itself followed by the field-expression string.
// "eventTimestamp.rowtime" exposes the event-time attribute under the name the window
// definitions reference via on("eventTimestamp").
Table fooTable = tEnv.fromDataStream(fooStream, "entityId, entityName, entityType, eventTimestamp.rowtime");
tEnv.registerTable("Foo", fooTable);

Table barTable = tEnv.fromDataStream(barStream, "entityId, entityName, entityType, eventTimestamp.rowtime");
tEnv.registerTable("Bar", barTable);
// Sliding counts per (entityId, entityName): 24-hour windows evaluated every 5 minutes.
// NOTE(review): with a 24h size and a 5min slide every event belongs to 288 overlapping
// windows; this is a likely contributor to large state/checkpoints — confirm against the
// planner actually in use.
Table slidingFooCountTable = fooTable
        .window(Slide.over("24.hour").every("5.minute").on("eventTimestamp").as("minuteWindow"))
        .groupBy("entityId, entityName, minuteWindow")
        .select("concat(concat(entityId,'_'), entityName) as slidingFooId, entityId as slidingFooEntityid, entityName as slidingFooEntityName, entityType.count as slidingFooCount, minuteWindow.rowtime as slidingFooMinute");

Table slidingBarCountTable = barTable
        .window(Slide.over("24.hour").every("5.minute").on("eventTimestamp").as("minuteWindow"))
        .groupBy("entityId, entityName, minuteWindow")
        .select("concat(concat(entityId,'_'), entityName) as slidingBarId, entityId as slidingBarEntityid, entityName as slidingBarEntityName, entityType.count as slidingBarCount, minuteWindow.rowtime as slidingBarMinute");

// Tumbling counts per (entityId, entityName); the window size comes from tumblingWindowTime,
// which is defined outside this snippet. The id is built as entityId_entityName so that it
// is comparable with the sliding ids in the join conditions.
Table tumblingFooCountTable = fooTable
        .window(Tumble.over(tumblingWindowTime).on("eventTimestamp").as("minuteWindow"))
        .groupBy("entityId, entityName, minuteWindow")
        .select("concat(concat(entityId,'_'), entityName) as tumblingFooId, entityId as tumblingFooEntityId, entityName as tumblingFooEntityName, entityType.count as tumblingFooCount, minuteWindow.rowtime as tumblingFooMinute");

Table tumblingBarCountTable = barTable
        .window(Tumble.over(tumblingWindowTime).on("eventTimestamp").as("minuteWindow"))
        .groupBy("entityId, entityName, minuteWindow")
        .select("concat(concat(entityId,'_'), entityName) as tumblingBarId, entityId as tumblingBarEntityId, entityName as tumblingBarEntityName, entityType.count as tumblingBarCount, minuteWindow.rowtime as tumblingBarMinute");
// Combine the four count tables into one row per (id, window end). The sliding Foo table
// is the left side of every join, so rows exist only where a sliding Foo count exists;
// the other three counts are null when no matching row is found.
Table aggregatedTable = slidingFooCountTable
        .leftOuterJoin(slidingBarCountTable, "slidingFooId = slidingBarId && slidingFooMinute = slidingBarMinute")
        .leftOuterJoin(tumblingBarCountTable, "slidingFooId = tumblingBarId && slidingFooMinute = tumblingBarMinute")
        .leftOuterJoin(tumblingFooCountTable, "slidingFooId = tumblingFooId && slidingFooMinute = tumblingFooMinute")
        .select("slidingFooMinute as timestamp, slidingFooEntityid as entityId, slidingFooEntityName as entityName, slidingFooCount, slidingBarCount, tumblingFooCount, tumblingBarCount");

DataStream<Result> result = tEnv.toAppendStream(aggregatedTable, Result.class);
result.addSink(sink); // write to an output stream to be picked up by a lambda function
公共类事件实现可序列化{
公共字符串entityId;
公共字符串entityType;
公共字符串entityName;
public long eventTimestamp=System.currentTimeMillis();
}
环境setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
数据流事件流=运动
.assignTimestampsAndWatermarks(新的BoundedAutofordernessTimestampExtractor(时间.秒(30)){
@凌驾
公共长提取时间戳(事件){
返回event.eventTimestamp;
}
})
DataStream fooStream=eventsStream
.filter(新的FilterFunction(){
@凌驾
公共布尔筛选器(事件)引发异常{
返回“foo.equalsIgnoreCase(event.entityType)”;
}
})
DataStream barStream=eventsStream
.filter(新的FilterFunction(){
@凌驾
公共布尔筛选器(事件)引发异常{
返回“bar”.equalsIgnoreCase(event.entityType);
}
})
StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);

// fromDataStream takes the DataStream itself followed by the field-expression string.
// "eventTimestamp.rowtime" exposes the event-time attribute under the name the window
// definitions reference via on("eventTimestamp").
Table fooTable = tEnv.fromDataStream(fooStream, "entityId, entityName, entityType, eventTimestamp.rowtime");
tEnv.registerTable("Foo", fooTable);

Table barTable = tEnv.fromDataStream(barStream, "entityId, entityName, entityType, eventTimestamp.rowtime");
tEnv.registerTable("Bar", barTable);
表格滑动FooCountTable=可移动
.window(在(“事件时间戳”)上每(“5分钟”)滑动一次(“24小时”)。作为(“分钟窗口”))
.groupBy(“entityId、entityName、分钟窗口”)
.选择(“concat(concat(entityId,'',entityName)作为slidingFooId,entityId作为slidingFooEntityid,entityName作为slidingFooEntityName,entityType.count作为slidingFooCount,minuteWindow.rowtime作为slidingFooMinute”);
Table slidengbarcounttable=barTable
.窗口(每(“5分钟”)在(“事件时间戳”)上滑动一次(“24小时”)。作为(“分钟窗口”))
.groupBy(“entityId、entityName、分钟窗口”)
.选择(“concat(concat(entityId,“,”),entityName)作为slidingBarId,entityId作为slidingBarEntityid,entityName作为slidingBarEntityName,entityType.count作为slidingBarCount,minuteWindow.rowtime作为slidingBarMinute”);
表tumblingFooCountTable=可移动
.window(翻滚.over(翻滚窗口时间).on(“事件时间戳”).as(“分钟窗口”))
.groupBy(“entityid、entityName、分钟窗口”)
.选择(“concat(concat(entityName,简称“”),entityName)作为tumblingFooId,entityId作为tumblingFooEntityId,entityNamae作为tumblingFooEntityName,entityType.count作为tumblingFooCount,minuteWindow.rowtime作为tumblingFooMinute”);
表tumblingBarCountTable=barTable
.window(翻滚.over(翻滚窗口时间).on(“事件时间戳”).as(“分钟窗口”))
.groupBy(“entityid、entityName、分钟窗口”)
.选择(“concat(concat(entityName,简称“'),entityName)作为tumblingBarId,entityId作为tumblingBarEntityId,entityNamae作为tumblingBarEntityName,entityType.count作为tumblingBarCount,minuteWindow.rowtime作为tumblingBarMinute”);
Table aggregatedTable=slidingFooCountTable
.leftOuterJoin(slidingBarCountTable,“slidingFooId=slidingBarId&&slidingFooMinute=slidingBarMinute”)
.leftOuterJoin(tumblingFooCountTable,“slidingFooId=tumblingBarId&&SlidingFoominMinute=tumblingBarMinute”)
.leftOuterJoin(tumblingFooCountTable,“slidingFooId=tumblingFooId&&slidingFooMinute=tumblingFooMinute”)
.选择(“slidingFooMinute作为时间戳,slidingFooCreativeId作为entityId,slidingFooEntityName作为entityName,slidingFooCount,slidingBarCount,tumblingFooCount,tumblingBarCount”);
DataStream result=tEnv.toAppendStream(可聚合,result.class);
结果。addSink(sink);//写入要由接收的输出流
public class Event implements Serializable {
public String entityId;
public String entityType;
public String entityName;
public long eventTimestamp = System.currentTimeMillis();
}
// Run the job on event time so windows are driven by the timestamp carried in each record.
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

// Take the timestamp from Event.eventTimestamp; the extractor is configured with a
// 30-second bound on out-of-orderness.
DataStream<Event> eventsStream = kinesis
        .assignTimestampsAndWatermarks(
                new BoundedOutOfOrdernessTimestampExtractor<Event>(Time.seconds(30)) {
                    @Override
                    public long extractTimestamp(Event event) {
                        return event.eventTimestamp;
                    }
                });
DataStream<Event> fooStream = eventsStream
.filter(new FilterFunction<Event>() {
@Override
public boolean filter(Event event) throws Exception {
return "foo".equalsIgnoreCase(event.entityType);
}
})
DataStream<Event> barStream = eventsStream
.filter(new FilterFunction<Event>() {
@Override
public boolean filter(Event event) throws Exception {
return "bar".equalsIgnoreCase(event.entityType);
}
})
StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);

// fromDataStream takes the DataStream itself followed by the field-expression string.
// "eventTimestamp.rowtime" exposes the event-time attribute under the name the window
// definitions reference via on("eventTimestamp").
Table fooTable = tEnv.fromDataStream(fooStream, "entityId, entityName, entityType, eventTimestamp.rowtime");
tEnv.registerTable("Foo", fooTable);

Table barTable = tEnv.fromDataStream(barStream, "entityId, entityName, entityType, eventTimestamp.rowtime");
tEnv.registerTable("Bar", barTable);
// Sliding counts per (entityId, entityName): 24-hour windows evaluated every 5 minutes.
// NOTE(review): with a 24h size and a 5min slide every event belongs to 288 overlapping
// windows; this is a likely contributor to large state/checkpoints — confirm against the
// planner actually in use.
Table slidingFooCountTable = fooTable
        .window(Slide.over("24.hour").every("5.minute").on("eventTimestamp").as("minuteWindow"))
        .groupBy("entityId, entityName, minuteWindow")
        .select("concat(concat(entityId,'_'), entityName) as slidingFooId, entityId as slidingFooEntityid, entityName as slidingFooEntityName, entityType.count as slidingFooCount, minuteWindow.rowtime as slidingFooMinute");

Table slidingBarCountTable = barTable
        .window(Slide.over("24.hour").every("5.minute").on("eventTimestamp").as("minuteWindow"))
        .groupBy("entityId, entityName, minuteWindow")
        .select("concat(concat(entityId,'_'), entityName) as slidingBarId, entityId as slidingBarEntityid, entityName as slidingBarEntityName, entityType.count as slidingBarCount, minuteWindow.rowtime as slidingBarMinute");

// Tumbling counts per (entityId, entityName); the window size comes from tumblingWindowTime,
// which is defined outside this snippet. The id is built as entityId_entityName so that it
// is comparable with the sliding ids in the join conditions.
Table tumblingFooCountTable = fooTable
        .window(Tumble.over(tumblingWindowTime).on("eventTimestamp").as("minuteWindow"))
        .groupBy("entityId, entityName, minuteWindow")
        .select("concat(concat(entityId,'_'), entityName) as tumblingFooId, entityId as tumblingFooEntityId, entityName as tumblingFooEntityName, entityType.count as tumblingFooCount, minuteWindow.rowtime as tumblingFooMinute");

Table tumblingBarCountTable = barTable
        .window(Tumble.over(tumblingWindowTime).on("eventTimestamp").as("minuteWindow"))
        .groupBy("entityId, entityName, minuteWindow")
        .select("concat(concat(entityId,'_'), entityName) as tumblingBarId, entityId as tumblingBarEntityId, entityName as tumblingBarEntityName, entityType.count as tumblingBarCount, minuteWindow.rowtime as tumblingBarMinute");
// Combine the four count tables into one row per (id, window end). The sliding Foo table
// is the left side of every join, so rows exist only where a sliding Foo count exists;
// the other three counts are null when no matching row is found.
Table aggregatedTable = slidingFooCountTable
        .leftOuterJoin(slidingBarCountTable, "slidingFooId = slidingBarId && slidingFooMinute = slidingBarMinute")
        .leftOuterJoin(tumblingBarCountTable, "slidingFooId = tumblingBarId && slidingFooMinute = tumblingBarMinute")
        .leftOuterJoin(tumblingFooCountTable, "slidingFooId = tumblingFooId && slidingFooMinute = tumblingFooMinute")
        .select("slidingFooMinute as timestamp, slidingFooEntityid as entityId, slidingFooEntityName as entityName, slidingFooCount, slidingBarCount, tumblingFooCount, tumblingBarCount");

DataStream<Result> result = tEnv.toAppendStream(aggregatedTable, Result.class);
result.addSink(sink); // write to an output stream to be picked up by a lambda function