Google cloud dataflow ApacheBeam/Google云数据流-有没有在管道中定期加载参考表的解决方案?
我需要使用 Apache Beam API 实现一个管道，并在 Google Cloud Dataflow 上运行。管道逻辑如下： 1. 从 Kafka 读取实时摄取的无界事件流（称之为 "RawEvent"）； 2. 从 Google BigQuery 加载一个引用表（该表每天都会更新，因此管道每天需要在某个时间点重新加载它，称之为 "RefTable"）； 3. 对于每个 RawEvent，如果其 id 出现在 RefTable 中，则丢弃该 RawEvent，否则将其加入最终输出。 所以基本上我需要让 RefTable 作为一种"静态引用"在管道中保留一天，然后每天重新
BigQueryIO
加载我的 RefTable，并将其作为一个侧输入（side input）——它是否对所有无界的 RawEvent 都可用？还是说窗口一过，RefTable 就不见了？（另外，我甚至不确定是否需要对 RefTable 应用窗口/触发器，因为它是有界数据。）以下是我的代码： PCollection<Long> ticks = p
// Produce 1 "tick" per second from an unbounded generated sequence.
.apply(GenerateSequence.from(0).withRate(1L,Duration.standardSeconds(1)))
// Group the ticks into fixed 1-minute windows.
.apply(Window.into(FixedWindows.of(Duration.standardMinutes(1)))
)
// Reduce each window to a single element with an arbitrary combiner (count);
// its only purpose is to fire one side-input reload per 1-minute window.
.apply(Combine.globally(Count.<Long>combineFn()).withoutDefaults());
// Produce a collection of maps, 1 per each 1-minute window.
// Every "tick" re-reads the BigQuery reference table, so the resulting
// side-input view is refreshed once per window (the "slowly updating
// side input" pattern from the Beam programming guide).
PCollectionView<Map<String, String>> banedDeviceMapView = ticks
    .apply("Load Ref Table",
        ParDo.of(new DoFn<Long, KV<String, String>>() {
          /**
           * Loads all RefTable rows from BigQuery and emits one KV per
           * distinct non-null HardwareId. Key equals value because only
           * set membership is needed downstream.
           */
          @ProcessElement
          public void processElement(ProcessContext c) {
            TableId table = TableId.of("project", "dataset", "RefTable");
            TableResult tableData =
                BIGQUERY_CLIENT.listTableData(table, GetSchema());
            // Diamond operator: the original used a raw HashMap, which
            // disables compile-time type checking for the map.
            Map<String, String> resultMap = new HashMap<>();
            // Assumes RefTable fits in worker memory — TODO confirm table size.
            for (FieldValueList row : tableData.iterateAll()) {
              // Skip rows whose HardwareId is NULL.
              Object key = row.get("HardwareId").getValue();
              if (key != null) {
                String hardwareId = (String) key;
                resultMap.putIfAbsent(hardwareId, hardwareId);
              }
            }
            for (Map.Entry<String, String> entry : resultMap.entrySet()) {
              c.output(KV.of(entry.getKey(), entry.getValue()));
            }
            // NOTE(review): System.out only reaches worker logs on Dataflow;
            // prefer an SLF4J logger in production code.
            System.out.println(resultMap.size() + " banded devices have been loaded.");
          }
        }))
    .apply(View.asMap());
// Unbounded source: consume GPS events from Kafka as (String key, GPS value) records.
PCollection<KafkaRecord<String, GPS>> rawLoad = p.apply("Read Events from Kafka"
, KafkaIO.<String, GPS>read()
.withBootstrapServers("localhost:9092")
.withTopic(SOURCE_GPS_TOPIC_NAME)
.withKeyDeserializer(StringDeserializer.class)
// Avro-decodes the GPS payload.
.withValueDeserializer(GPSEventAvroDeserializer.class)
// Start from the earliest offset when no committed offset exists.
// NOTE(review): updateConsumerProperties is deprecated in newer Beam
// releases in favor of withConsumerConfigUpdates — verify the Beam version.
.updateConsumerProperties(ImmutableMap.of(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"))
);
// Extract the KV payload from each KafkaRecord, window the stream, and drop
// every event whose device id appears in the reference-table side input.
PCollection<KV<String, GPS>> validGps = rawLoad.apply("Extract Gps from Kafka", ParDo.of(
new DoFn<KafkaRecord<String, GPS>, KV<String, GPS>>() {
@ProcessElement
public void processElement(ProcessContext c) {
// NOTE(review): per-element System.out logging is expensive on a
// streaming pipeline; keep for debugging only.
System.out.print("KafkaRecord: KV.of("+ c.element().getKV().getKey());
System.out.println("," + c.element().getKV().getValue().getSpeed() + ")");
c.output(c.element().getKV());
}
}))
// 1-second main-input windows. Beam's side-input window-mapping fn maps each
// of these onto the enclosing 1-minute window of banedDeviceMapView, so the
// side input stays available for all elements of that minute.
.apply("Windowing Raw Events",Window.into(FixedWindows.of(Duration.standardSeconds(1)))
)
.apply("Filter",ParDo.of(
new DoFn<KV<String,GPS>,KV<String,GPS>>(){
@ProcessElement
public void processElement(ProcessContext c){
// Fetch the side-input map matched to this element's window.
Map<String,String> bandedDevices = c.sideInput(banedDeviceMapView);
String deviceId = c.element().getKey();
System.out.print("Checking device: "+ deviceId);
System.out.println(" - in bandedDevices? " + bandedDevices.containsKey(deviceId));
// Pass through only devices NOT present in the reference table.
if(!bandedDevices.containsKey(deviceId)){
c.output(c.element());
}else{
System.out.println("Device " + deviceId + " is removed from results");
}
}
}).withSideInputs(banedDeviceMapView)
);
PCollection<Long> ticks = p
    // Produce 1 "tick" per 1 second
    .apply(GenerateSequence.from(0).withRate(1L, Duration.standardSeconds(1)))
    // Window the ticks into 1-minute windows
    .apply(Window.into(FixedWindows.of(Duration.standardMinutes(1))))
    // Use an arbitrary per-window combiner to reduce to 1 element per window
    .apply(Combine.globally(Count.<Long>combineFn()).withoutDefaults());
// Produce a collection of maps, 1 per each 1-minute window
PCollectionView<Map<String, String>> banedDeviceMapView = ticks
    .apply("Load Ref Table", ParDo.of(new DoFn<Long, KV<String, String>>() {
        @ProcessElement
        public void processElement(ProcessContext c) {
            TableId table = TableId.of("project", "dataset", "RefTable");
            TableResult tableData =
                BIGQUERY_CLIENT.listTableData(table, GetSchema());
            Map<String, String> resultMap = new HashMap<>();
            for (FieldValueList row : tableData.iterateAll()) {
                Object key = row.get("HardwareId").getValue();
                if (key != null) {
                    String hardwareId = (String) key;
                    resultMap.putIfAbsent(hardwareId, hardwareId);
                }
            }
            int num = 0;
            for (Map.Entry<String, String> entry : resultMap.entrySet()) {
                c.output(KV.of(entry.getKey(), entry.getValue()));
                num++;
            }
            System.out.println(num + " banded devices have been loaded.");
        }
    }))
    .apply(View.asMap());
PCollection<KafkaRecord<String, GPS>> rawLoad = p.apply("Read Events from Kafka",
    KafkaIO.<String, GPS>read()
        .withBootstrapServers("localhost:9092")
        .withTopic(SOURCE_GPS_TOPIC_NAME)
        .withKeyDeserializer(StringDeserializer.class)
        .withValueDeserializer(GPSEventAvroDeserializer.class)
        .updateConsumerProperties(ImmutableMap.of(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest")));
PCollection<KV<String, GPS>> validGps = rawLoad.apply("Extract Gps from Kafka", ParDo.of(
    new DoFn<KafkaRecord<String, GPS>, KV<String, GPS>>() {