Google Cloud Dataflow:使用 Apache Beam 进行多列连接
我有两个文件 prices_1.txt(源)和 products_1.txt(目标)。我已经成功地编写了一个在两侧按单列连接的程序,但现在我想按多个列连接,却不知道如何用 Apache Beam 实现。下面给出了单列连接程序以及我正在使用的数据文件。该程序按两个文件中的 SLID 列执行连接;我怎样才能同时按 SLID 和 PRODID 连接呢?请指导我。
prices_1.txt
SLID,PRODID,REGIONID,STDPRICE,MINPRICE,STARTDATE,ENDDATE
9,100860,101,130,124,2002-01-01,2002-12-31
4,100860,102,132,125.6,2002-01-01,2003-05-31
7,100860,103,135,128,2003-06-01,
11,100861,105,239,231.2,2002-01-01,2002-12-31
2,100861,107,242,233.6,2003-01-01,2003-05-31
6,100861,106,245,236,2003-06-01,
4,100870,104,122.8,122.4,2003-01-01,
3,100871,101,154,153.2,2002-01-01,2002-12-31
1,100890,108,445,440.5,2003-06-01,2003-07-31
5,100890,105,449.7,446.4,2002-01-01,
10,101863,102,98.0,99.1,2002-04-01,2003-03-15
8,102130,103,178.9,182.5,2002-07-01,2003-04-12
products_1.txt
SLID,PRODID,NAME
4,100860,"Motherboard"
2,100861,"Flat Monitor"
3,100870,"Processor 5 GHZ"
1,100871,"Printer"
8,100890,"Digital Camera"
11,101860,"Memory Card 1GB"
9,101863,"Video Accelerator"
10,102130,"Scanner"
6,200376,"Network card"
7,200380,"Flash card"
5,300001,"LCD Monitor"
12,10987,"Mouse"
程序代码
/**
 * Joins the rows of two CSV files (prices and products) on a common key
 * using Beam's CoGroupByKey, then writes the joined lines to a single
 * output file. The join key is produced by the Extract*DataFn classes;
 * to join on multiple columns, emit a composite key there
 * (e.g. {@code row[0] + "," + row[1]}).
 */
public class JoinExample {

    /**
     * Builds and runs the pipeline: read both files, join, write the result,
     * and print the elapsed wall-clock time in seconds.
     */
    public static void main(String[] args) throws Exception {
        long start = System.currentTimeMillis();
        PipelineOptions options = PipelineOptionsFactory.create().as(HadoopFileSystemOptions.class);
        options.setRunner(SparkRunner.class);
        Pipeline pipeline = Pipeline.create(options);
        PCollection<String> prices = pipeline.apply(TextIO.read().from("/home/ICEStore/apachebeam/prices_1.txt"));
        PCollection<String> products = pipeline.apply(TextIO.read().from("/home/ICEStore/apachebeam/products_1.txt"));
        PCollection<String> formattedResults = joinEvents(prices, products);
        formattedResults.apply(TextIO.write().to("/home/ICEStore/apachebeam/temp/join").withoutSharding());
        pipeline.run().waitUntilFinish();
        long end = System.currentTimeMillis();
        System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++Groovy End :: " + TimeUnit.MILLISECONDS.toSeconds(end - start));
    }

    /**
     * Performs an inner join of the price lines with the product lines on the
     * key emitted by {@link ExtractPricesDataFn} / {@link ExtractProductsDataFn}.
     *
     * @param prices   raw CSV lines from the prices file
     * @param products raw CSV lines from the products file
     * @return one output line per matching (price, product) pair, formed by
     *         concatenating the two original CSV lines
     */
    static PCollection<String> joinEvents(PCollection<String> prices, PCollection<String> products) throws Exception {
        final TupleTag<String> priceInfoTag = new TupleTag<String>();
        final TupleTag<String> productInfoTag = new TupleTag<String>();
        PCollection<KV<String, String>> pricesInfo = prices.apply(ParDo.of(new ExtractPricesDataFn()));
        PCollection<KV<String, String>> productsInfo = products.apply(ParDo.of(new ExtractProductsDataFn()));
        PCollection<KV<String, CoGbkResult>> kvpCollection = KeyedPCollectionTuple
            .of(priceInfoTag, pricesInfo)
            .and(productInfoTag, productsInfo)
            .apply(CoGroupByKey.<String>create());
        PCollection<KV<String, String>> finalResultCollection = kvpCollection.apply(ParDo.of(
            new DoFn<KV<String, CoGbkResult>, KV<String, String>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                    KV<String, CoGbkResult> e = c.element();
                    // BUG FIX: the original called iter.next() unconditionally,
                    // which throws NoSuchElementException for keys present on
                    // only one side (e.g. a product with no price), and silently
                    // dropped all but the first row per side. Emitting the full
                    // cross product of matching rows gives a proper inner join.
                    for (String price : e.getValue().getAll(priceInfoTag)) {
                        for (String product : e.getValue().getAll(productInfoTag)) {
                            c.output(KV.of(e.getKey(), price + product));
                        }
                    }
                }
            }));
        PCollection<String> formattedResults = finalResultCollection
            .apply(ParDo.of(new DoFn<KV<String, String>, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                    // Only the joined row text is emitted; the key columns are
                    // already embedded in the price portion of the value.
                    c.output(c.element().getValue());
                }
            }));
        return formattedResults;
    }

    /** Keys each prices CSV line by its first column (SLID). */
    static class ExtractPricesDataFn extends DoFn<String, KV<String, String>> {
        @ProcessElement
        public void processElement(ProcessContext context) throws Exception {
            String[] row = context.element().split(",");
            // Skip the header line; otherwise "SLID" is treated as a join key
            // and the two headers are joined into a junk output row.
            if ("SLID".equals(row[0])) {
                return;
            }
            // For a multi-column join, emit a composite key instead,
            // e.g. KV.of(row[0] + "," + row[1], context.element()).
            context.output(KV.of(row[0], context.element()));
        }
    }

    /** Keys each products CSV line by its first column (SLID). */
    static class ExtractProductsDataFn extends DoFn<String, KV<String, String>> {
        @ProcessElement
        public void processElement(ProcessContext context) throws Exception {
            String[] row = context.element().split(",");
            // Skip the header line (see ExtractPricesDataFn).
            if ("SLID".equals(row[0])) {
                return;
            }
            context.output(KV.of(row[0], context.element()));
        }
    }
}
公共类连接示例{
公共静态void main(字符串[]args)引发异常
{
长启动=System.currentTimeMillis();
PipelineOptions=PipelineOptionsFactory.create().as(HadoopFileSystemOptions.class);
options.setRunner(SparkRunner.class);
Pipeline=Pipeline.create(选项);
PCollection prices=pipeline.apply(TextIO.read().from(“/home/ICEStore/apachebeam/prices_1.txt”);
PCollection products=pipeline.apply(TextIO.read().from(“/home/ICEStore/apachebeam/products_1.txt”);
PCollection formattedResults=联合事件(价格、产品);
格式化结果。应用(TextIO.write().to(“/home/ICEStore/apachebeam/temp/join”)。无需硬处理();
pipeline.run().waitUntilFinish();
long end=System.currentTimeMillis();
System.out.println(“+TimeUnit.millizes.toSeconds(End-start));
}
静态PCollection joinEvents(PCollection prices、PCollection products)引发异常
{
final TupleTag priceInfoTag=新TupleTag();
final TupleTag productInfoTag=新TupleTag();
PCollection pricesInfo=prices.apply(新提取价格datafn()的第1部分);
PCollection productsInfo=products.apply(新提取产品datafn())的ParDo.of;
PCollection kvpCollection=KeyedPCollectionTuple
.of(priceInfoTag、pricesInfo)
.和(productInfoTag、productsInfo)
.apply(CoGroupByKey.create());
PCollection finalResultCollection=
kvpCollection.apply(第页)(
新DoFn(){
@过程元素
公共void processElement(ProcessContext c){
KV e=c.元件();
迭代器iter1=e.getValue().getAll(priceInfoTag).Iterator();
int lhscont=0;
迭代器iter2=e.getValue().getAll(productInfoTag).Iterator();
c、 输出(千伏of(例如getKey(),(iter1.next()+iter2.next());
}
}));
PCollection formattedResults=finalResultCollection
.适用(新DoFn()的第{
@过程元素
公共void processElement(ProcessContext c){
String outputstring=“国家代码:”+c.element().getKey()
+“,”+c.element().getValue();
c、 输出(c.element().getValue());
}
}));
返回格式化结果;
}
静态类ExtractPricesDataFn扩展了DoFn{
@过程元素
public void processElement(ProcessContext上下文)引发异常{
字符串[]行=context.element().split(“,”);
context.output(第[0]行,context.element()的千伏数);
}
}
静态类ExtractProductsDataFn扩展了DoFn{
@过程元素
public void processElement(ProcessContext上下文)引发异常{
字符串[]行=context.element().split(“,”);
context.output(第[0]行,context.element()的千伏数);
}
}
}
简单的解决方法是在 ExtractPricesDataFn 和 ExtractProductsDataFn 中,把连接键改为由多个列拼接而成的复合键,示例如下:
static class ExtractPricesDataFn extends DoFn<String, KV<String, String>> {
@ProcessElement
public void processElement(ProcessContext context) throws Exception {
String[] row = context.element().split(",");
context.output(KV.of(row[0] + "," + row[1], context.element()));
}
static class ExtractProductsDataFn extends DoFn<String, KV<String, String>> {
@ProcessElement
public void processElement(ProcessContext context) throws Exception {
String[] row = context.element().split(",");
context.output(KV.of(row[0] + "," + row[1], context.element()));
}
}
静态类ExtractPricesDataFn扩展了DoFn{
@过程元素
public void processElement(ProcessContext上下文)引发异常{
字符串[]行=context.element().split(“,”);
context.output(第[0]+“,“+第[1]行,context.element())的千伏数);
}
静态类ExtractProductsDataFn扩展了DoFn{
@过程元素
public void processElement(ProcessContext上下文)引发异常{
字符串[]行=context.element().split(“,”);
context.output(第[0]+“,“+第[1]行,context.element())的千伏数);
}
}
谢谢回复。但当各列是不同的数据类型时,这种拼接字符串的做法会失败。——从你的问题来看,我理解你是想同时按 SLID 和 PRODID 连接,对吗?你能否编辑帖子,补充你期望得到的输出?