在 BigQuery / Apache Beam 中访问 TableRow 的列(google-bigquery, google-cloud-dataflow, apache-beam)
我正在尝试实现以下目标:1. 从 Cloud Pub/Sub 读取 JSON 事件;2. 使用文件加载(FILE_LOADS)方式,每 15 分钟把事件批量写入 BigQuery,以节省流式插入的成本;3. 目标表根据 JSON 事件中的 "user_id" 和 "campaign_id" 字段动态决定:"user_id" 作为数据集名,"campaign_id" 作为表名,分区名来自事件时间戳;4. 所有表的模式(schema)相同。我不熟悉 Java 和 Beam,我的代码基本完成了我想做的事情,只差一点:我无法在 getDestination 中访问 JSON 消息里的 "campaign_id" 和 "user_id" 字段,因此事件没有被路由到正确的表。
package ...;
import com.google.api.services.bigquery.model.TableSchema;
import javafx.scene.control.TableRow;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinations;
import org.apache.beam.sdk.io.gcp.bigquery.TableDestination;
import org.apache.beam.sdk.io.gcp.bigquery.TableRowJsonCoder;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.ValueInSingleWindow;
import org.joda.time.Duration;
import org.joda.time.Instant;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED;
import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method.FILE_LOADS;
import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition.WRITE_APPEND;
public class ClickLogConsumer {
private static final int BATCH_INTERVAL_SECS = 15 * 60;
private static final String PROJECT = "pure-app";
public static PTransform<PCollection<String>, PCollection<com.google.api.services.bigquery.model.TableRow>> jsonToTableRow() {
return new JsonToTableRow();
}
private static class JsonToTableRow
extends PTransform<PCollection<String>, PCollection<com.google.api.services.bigquery.model.TableRow>> {
@Override
public PCollection<com.google.api.services.bigquery.model.TableRow> expand(PCollection<String> stringPCollection) {
return stringPCollection.apply("JsonToTableRow", MapElements.<String, com.google.api.services.bigquery.model.TableRow>via(
new SimpleFunction<String, com.google.api.services.bigquery.model.TableRow>() {
@Override
public com.google.api.services.bigquery.model.TableRow apply(String json) {
try {
InputStream inputStream = new ByteArrayInputStream(
json.getBytes(StandardCharsets.UTF_8.name()));
//OUTER is used here to prevent EOF exception
return TableRowJsonCoder.of().decode(inputStream, Coder.Context.OUTER);
} catch (IOException e) {
throw new RuntimeException("Unable to parse input", e);
}
}
}));
}
}
public static void main(String[] args) throws Exception {
Pipeline pipeline = Pipeline.create(options);
pipeline
.apply(PubsubIO.readStrings().withTimestampAttribute("timestamp").fromTopic("projects/pureapp-199410/topics/clicks"))
.apply(jsonToTableRow())
.apply("WriteToBQ",
BigQueryIO.writeTableRows()
.withMethod(FILE_LOADS)
.withWriteDisposition(WRITE_APPEND)
.withCreateDisposition(CREATE_IF_NEEDED)
.withTriggeringFrequency(Duration.standardSeconds(BATCH_INTERVAL_SECS))
.withoutValidation()
.to(new DynamicDestinations<TableRow, String>() {
@Override
public String getDestination(ValueInSingleWindow<TableRow> element) {
String tableName = "campaign_id"; // JSON message in Pub/Sub has "campaign_id" field, how do I access it here?
String datasetName = "user_id"; // JSON message in Pub/Sub has "user_id" field, how do I access it here?
Instant eventTimestamp = element.getTimestamp();
String partition = new SimpleDateFormat("yyyyMMdd").format(eventTimestamp);
return String.format("%s:%s.%s$%s", PROJECT, datasetName, tableName, partition);
}
@Override
public TableDestination getTable(String table) {
return new TableDestination(table, null);
}
@Override
public TableSchema getSchema(String destination) {
return getTableSchema();
}
}));
pipeline.run();
}
}
包。。。;
导入com.google.api.services.bigquery.model.TableSchema;
导入javafx.scene.control.TableRow;
导入org.apache.beam.sdk.Pipeline;
导入org.apache.beam.sdk.coders.Coder;
导入org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
导入org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinations;
导入org.apache.beam.sdk.io.gcp.bigquery.TableDestination;
导入org.apache.beam.sdk.io.gcp.bigquery.TableRowJsonCoder;
导入org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
导入org.apache.beam.sdk.transforms.MapElements;
导入org.apache.beam.sdk.transforms.ptTransform;
导入org.apache.beam.sdk.transforms.SimpleFunction;
导入org.apache.beam.sdk.values.PCollection;
导入org.apache.beam.sdk.values.valuesinSingleWindow;
导入org.joda.time.Duration;
导入org.joda.time.Instant;
导入java.io.ByteArrayInputStream;
导入java.io.IOException;
导入java.io.InputStream;
导入java.nio.charset.StandardCharset;
导入java.text.simpleDataFormat;
导入静态org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition.CREATE_(如果需要);
导入静态org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method.FILE\u加载;
导入静态org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition.Write\u APPEND;
公共类ClickLogConsumer{
专用静态最终整数批次间隔=15*60;
私有静态最终字符串PROJECT=“pure-app”;
公共静态PTransform jsonToTableRow(){
返回新的JsonToTableRow();
}
私有静态类JsonToTableRow
转移{
@凌驾
公共PCollection扩展(PCollection stringPCollection){
返回stringPCollection.apply(“JsonToTableRow”,MapElements.via(
新的SimpleFunction(){
@凌驾
public com.google.api.services.bigquery.model.TableRow应用(字符串json){
试一试{
InputStream InputStream=新的ByteArrayInputStream(
getBytes(StandardCharsets.UTF_8.name());
//这里使用OUTER来防止EOF异常
返回TableRowJsonCoder.of().decode(inputStream,Coder.Context.OUTER);
}捕获(IOE异常){
抛出新的RuntimeException(“无法解析输入”,e);
}
}
}));
}
}
公共静态void main(字符串[]args)引发异常{
Pipeline=Pipeline.create(选项);
管道
.apply(PubsubIO.readStrings().withtimestamp属性(“timestamp”).fromTopic(“projects/pureapp-199410/topics/clicks”))
.apply(jsonToTableRow())
.apply(“WriteToBQ”,
BigQueryIO.writeTableRows()
.withMethod(文件加载)
.带writedisposition(WRITE_APPEND)
.withCreateDisposition(如果需要,则创建)
.带触发频率(持续时间.标准秒(批次间隔秒))
.未经验证()
.to(新的动态估计){
@凌驾
公共字符串getDestination(ValueInSingleWindow元素){
String tableName=“campaign\u id”;//发布/订阅中的JSON消息具有“campaign\u id”字段,如何在此处访问它?
String datasetName=“user\u id”;//发布/订阅中的JSON消息具有“user\u id”字段,如何在此处访问它?
Instant eventTimestamp=element.getTimestamp();
字符串分区=新的SimpleDataFormat(“yyyyMMdd”).format(eventTimestamp);
返回字符串.format(“%s:%s.%s$%s”,项目,数据集名,表名,分区);
}
@凌驾
公共表目标可获取(字符串表){
返回新的TableDestination(table,null);
}
@凌驾
公共表模式getSchema(字符串目标){
返回getTableSchema();
}
}));
pipeline.run();
}
}
上述代码是我根据阅读以下资料得出的(原文此处附有五个参考链接,编号 1–5)。

更新:改用 SerializableFunction 形式的 .to() 之后,代码如下——
import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.api.services.bigquery.model.TimePartitioning;
import com.google.common.collect.ImmutableList;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.bigquery.TableDestination;
import org.apache.beam.sdk.io.gcp.bigquery.TableRowJsonCoder;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.values.PCollection;
import org.joda.time.Duration;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED;
import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method.FILE_LOADS;
import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition.WRITE_APPEND;
public class ClickLogConsumer {
private static final int BATCH_INTERVAL_SECS = 15 * 60;
private static final String PROJECT = "pure-app";
public static PTransform<PCollection<String>, PCollection<TableRow>> jsonToTableRow() {
return new JsonToTableRow();
}
private static class JsonToTableRow
extends PTransform<PCollection<String>, PCollection<TableRow>> {
@Override
public PCollection<TableRow> expand(PCollection<String> stringPCollection) {
return stringPCollection.apply("JsonToTableRow", MapElements.<String, com.google.api.services.bigquery.model.TableRow>via(
new SimpleFunction<String, TableRow>() {
@Override
public TableRow apply(String json) {
try {
InputStream inputStream = new ByteArrayInputStream(
json.getBytes(StandardCharsets.UTF_8.name()));
//OUTER is used here to prevent EOF exception
return TableRowJsonCoder.of().decode(inputStream, Coder.Context.OUTER);
} catch (IOException e) {
throw new RuntimeException("Unable to parse input", e);
}
}
}));
}
}
public static void main(String[] args) throws Exception {
Pipeline pipeline = Pipeline.create(options);
pipeline
.apply(PubsubIO.readStrings().withTimestampAttribute("timestamp").fromTopic("projects/pureapp-199410/topics/clicks"))
.apply(jsonToTableRow())
.apply(BigQueryIO.write()
.withTriggeringFrequency(Duration.standardSeconds(BATCH_INTERVAL_SECS))
.withMethod(FILE_LOADS)
.withWriteDisposition(WRITE_APPEND)
.withCreateDisposition(CREATE_IF_NEEDED)
.withSchema(new TableSchema().setFields(
ImmutableList.of(
new TableFieldSchema().setName("timestamp").setType("TIMESTAMP"),
new TableFieldSchema().setName("exchange").setType("STRING"))))
.to((row) -> {
String datasetName = row.getValue().get("user_id").toString();
String tableName = row.getValue().get("campaign_id").toString();
return new TableDestination(String.format("%s:%s.%s", PROJECT, datasetName, tableName), "Some destination");
})
.withTimePartitioning(new TimePartitioning().setField("timestamp")));
pipeline.run();
}
}
import com.google.api.services.bigquery.model.TableFieldSchema;
导入com.google.api.services.bigquery.model.TableRow;
导入com.google.api.services.bigquery.model.TableSchema;
导入com.google.api.services.bigquery.model.TimePartitioning;
导入com.google.common.collect.ImmutableList;
导入org.apache.beam.sdk.Pipeline;
导入org.apache.beam.sdk.coders.Coder;
我