Google bigquery 数据流到BigQuery和存储系统的延迟非常高
我们正在 GCP 中创建一个数据管道,并且在测试过程中遇到一些问题。我们当前的架构基于 AWS;为了测试,我们正在从 Lambda 实时向 Pub/Sub 推送一份数据副本。(相关标签:google-bigquery、google-cloud-storage、google-cloud-dataflow)遇到的问题:从 Pub/Sub 经 Dataflow 写入 BigQuery 和存储系统的延迟非常高(是否有办法按表进行批量加载,而不是一次插入一个事件?)。我们有一个 5 分钟的窗口;5 分钟后,我们按事件键对数据进行分组以便存储,并把该时间段内的所有事件写入单个文件。能否在 BigQuery 中执行类似的操作,并且仅为每种事件类型定义一次模式,而不是为所有事件逐条定义?
- 面临从pubsub到BigQuery以及通过数据流存储的延迟问题(是否有办法按照表进行批量加载,而不是一次插入一个事件)我们有一个5分钟的窗口,5分钟后,我们按事件键对数据进行分组,以便存储,并在单个文件中写入该持续时间内的所有事件。我们可以在BigQuery中执行类似的操作,并仅为一个事件类型而不是所有事件定义一次模式
- 工作器(worker)的自动扩缩并未发生(当前设置:最小值为 2,最大值为 10)
- 使用的所有服务均位于 asia-northeast1 区域
- 我们通常每天收到约 300 万条记录;对于这种量级,什么是 Dataflow 最佳的工作器配置?
package purplle.datapipeline; import static java.nio.charset.StandardCharsets.UTF_8; import java.net.SocketTimeoutException; import java.time.LocalDateTime; import java.time.ZoneId; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.io.TextIO; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinations; import org.apache.beam.sdk.io.gcp.bigquery.InsertRetryPolicy; import org.apache.beam.sdk.io.gcp.bigquery.TableDestination; import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO; import org.apache.beam.sdk.options.Default; import org.apache.beam.sdk.options.Description; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.GroupByKey; import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.transforms.SimpleFunction; import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime; import org.apache.beam.sdk.transforms.windowing.GlobalWindows; import org.apache.beam.sdk.transforms.windowing.Repeatedly; import org.apache.beam.sdk.transforms.windowing.Window; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.ValueInSingleWindow; import org.joda.time.Duration; import org.json.JSONException; import org.json.JSONObject; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.api.services.bigquery.Bigquery; import com.google.api.services.bigquery.model.TableRow; import com.google.api.services.bigquery.model.TableSchema; import 
com.google.cloud.storage.Blob; import com.google.cloud.storage.BlobId; import com.google.cloud.storage.BlobInfo; import com.google.cloud.storage.Storage; import com.google.cloud.storage.StorageOptions; import purplle.datapipeline.buisness.EventSchemaBuilder; import purplle.datapipeline.buisness.Ordering; import purplle.datapipeline.common.Constants; import purplle.datapipeline.helpers.Event_ordering; import purplle.datapipeline.helpers.Event_schema; import purplle.datapipeline.helpers.JSON_helper; public class StarterPipeline { public interface StarterPipelineOption extends PipelineOptions { /** * Set this required option to specify where to read the input. */ @Description("Path of the file to read from") @Default.String(Constants.pubsub_event_pipeline_url) String getInputFile(); void setInputFile(String value); } @SuppressWarnings("serial") static class ParseJsonData_storage extends DoFn<String, KV<String, String>> { @ProcessElement public void processElement(ProcessContext c) throws JSONException { Logger log = LoggerFactory.getLogger(StarterPipeline.class); if (c.element().length() > 0 && JSON_helper.isJSONValid(c.element())) { JSONObject event_obj = new JSONObject(c.element()); if (event_obj.length() > 0 && event_obj.has("event")) { JSONObject ob2 = JSON_helper.flatJsonConvertKeyToLower(event_obj); if (ob2.length() > 0 && ob2.has("event")) { // Reorder the json object then pass to create pipe saperated string. 
KV<String, String> event_kv_pair = Event_ordering.order_event_columns(ob2, "storage"); if (!event_kv_pair.getKey().isEmpty() && event_kv_pair.getKey().length() > 0) { c.output(event_kv_pair); } else { log = LoggerFactory.getLogger(StarterPipeline.class); log.error("Storage string empty = " + c.element()); } } else { log = LoggerFactory.getLogger(StarterPipeline.class); log.error("Storage object error = " + c.element()); } } else { log = LoggerFactory.getLogger(StarterPipeline.class); log.error("Storage object error = " + c.element()); } } else { log = LoggerFactory.getLogger(StarterPipeline.class); log.error("Storage empty element = " + c.element()); } } } @SuppressWarnings("serial") static class ParseJsonData_bigquery extends DoFn<String, TableRow> { @ProcessElement public void processElement(ProcessContext c) throws JSONException { Logger log = LoggerFactory.getLogger(StarterPipeline.class); log.info("Event json = " + c.element()); if (!c.element().isEmpty() && JSON_helper.isJSONValid(c.element())) { JSONObject event_obj = new JSONObject(c.element()); if (event_obj.length() > 0 && event_obj.has("event")) { JSONObject ob2 = JSON_helper.flatJsonConvertKeyToLower(event_obj); if (ob2.length() > 0 && ob2.has("event")) { TableRow event_row = EventSchemaBuilder.get_event_row(ob2, "bigquery"); if (!event_row.isEmpty()) { c.output(event_row); } else { log = LoggerFactory.getLogger(StarterPipeline.class); log.error("Bigquery set event ordering schema error = " + c.element()); } } else { log = LoggerFactory.getLogger(StarterPipeline.class); log.error("Bigquery set event ordering object error = " + c.element()); } } else { log = LoggerFactory.getLogger(StarterPipeline.class); log.error("Bigquery event item object error = " + c.element()); } } else { log = LoggerFactory.getLogger(StarterPipeline.class); log.error("Bigquery event item error = " + c.element()); } } } @SuppressWarnings("serial") static class Write_to_GCS extends DoFn<KV<String, String>, TextIO.Write> { 
@ProcessElement public void processElement(ProcessContext c) throws JSONException { String event_string = c.element().getValue(); String event_name = c.element().getKey(); LocalDateTime now = LocalDateTime.now(ZoneId.of("Asia/Kolkata")); int year = now.getYear(); int month = now.getMonthValue(); int day = now.getDayOfMonth(); int hour = now.getHour(); int minute = now.getMinute(); int second = now.getSecond(); String storage_file_path = event_name + "/" + year + "/" + month + "/" + day + "/" + hour + "/" + event_name + "-" + year + "-" + month + "-" + day + "-" + hour + "-" + minute + "-" + second + ".txt"; Logger log = LoggerFactory.getLogger(StarterPipeline.class); log.info("Writing file to location = " + storage_file_path); // Create your service object Storage storage = StorageOptions.getDefaultInstance().getService(); // Upload a blob to the newly created bucket BlobId blobId = BlobId.of(Constants.gcp_events_bucket_name, storage_file_path); BlobInfo blobInfo = BlobInfo.newBuilder(blobId).setContentType("text/plain").build(); @SuppressWarnings("unused") Blob blob = storage.create(blobInfo, event_string.getBytes(UTF_8)); } } @SuppressWarnings("serial") public static class ReadEventJson_storage extends PTransform<PCollection<String>, PCollection<KV<String, String>>> { @Override public PCollection<KV<String, String>> expand(PCollection<String> lines) { Logger log = LoggerFactory.getLogger(StarterPipeline.class); log.info("Storage workflow started"); @SuppressWarnings("unused") Boolean tempbool = Event_ordering.setEventsOrdering(); // Convert lines of text into individual words. 
PCollection<KV<String, String>> words = lines.apply(ParDo.of(new ParseJsonData_storage())); return words; } } @SuppressWarnings("serial") public static class ReadEventJson_bigquery extends PTransform<PCollection<String>, PCollection<TableRow>> { @Override public PCollection<TableRow> expand(PCollection<String> lines) { Logger log = LoggerFactory.getLogger(StarterPipeline.class); log.info("Bigquery workflow started"); @SuppressWarnings("unused") Boolean tempbool = Event_ordering.setEventsOrdering(); log.info("Bigquery get event ordering"); Ordering events_ordering = Event_ordering.getEventsOrdering(); Event_schema es = new Event_schema(); es.setEventSchema(events_ordering); // Convert lines of text into individual words. PCollection<TableRow> table_row = lines.apply(ParDo.of(new ParseJsonData_bigquery())); log.info("Bigquery workflow rows prepared"); return table_row; } } /** A SimpleFunction that converts a Word and Count into a printable string. */ @SuppressWarnings("serial") public static class CombineEventStrings extends SimpleFunction<KV<String, Iterable<String>>, KV<String, String>> { @Override public KV<String, String> apply(KV<String, Iterable<String>> input) { String combined_event = ""; for (String combined_str : input.getValue()) { combined_event += combined_str + "\n"; } Logger log = LoggerFactory.getLogger(StarterPipeline.class); log.info("combined_event = " + combined_event); KV<String, String> return_kv = KV.of(input.getKey(), combined_event); return return_kv; } } @SuppressWarnings("serial") public static void main(String[] args) throws SocketTimeoutException { Logger log = LoggerFactory.getLogger(StarterPipeline.class); log.info("Events pipeline job started"); StarterPipelineOption options = PipelineOptionsFactory.fromArgs(args).withValidation() .as(StarterPipelineOption.class); Pipeline p = Pipeline.create(options); log.info("Pipeline created"); log.info("Pipeline Started"); PCollection<String> datastream = p.apply("Read Events From Pubsub", 
PubsubIO.readStrings().fromSubscription(Constants.pubsub_event_pipeline_url)); // PCollection<String> windowed_items = // datastream.apply(Window.<String>into(FixedWindows.of(Duration.standardMinutes(1)))); // PCollection<String> windowed_items = datastream.apply( // Window.<String>into(SlidingWindows.of(Duration.standardMinutes(1)).every(Duration.standardSeconds(10)))); PCollection<String> windowed_items = datastream.apply(Window.<String>into(new GlobalWindows()) .triggering(Repeatedly.forever( AfterProcessingTime.pastFirstElementInPane().plusDelayOf(Duration.standardSeconds(300)))) .withAllowedLateness(Duration.standardDays(10)).discardingFiredPanes()); // Write to storage windowed_items.apply("Read and make pipe separated event string", new ReadEventJson_storage()) .apply("Combine events by keys", GroupByKey.<String, String>create()) .apply("Combine events strings by event name", MapElements.via(new CombineEventStrings())) .apply("Manually write events to GCS", ParDo.of(new Write_to_GCS())); // Write into Big Query windowed_items.apply("Read and make event table row", new ReadEventJson_bigquery()) .apply("Write_events_to_BQ", BigQueryIO.writeTableRows().to(new DynamicDestinations<TableRow, String>() { public String getDestination(ValueInSingleWindow<TableRow> element) { String destination = EventSchemaBuilder .fetch_destination_based_on_event(element.getValue().get("event").toString()); return destination; } @Override public TableDestination getTable(String table) { String destination = EventSchemaBuilder.fetch_table_name_based_on_event(table); return new TableDestination(destination, destination); } @Override public TableSchema getSchema(String table) { TableSchema table_schema = EventSchemaBuilder.fetch_table_schema_based_on_event(table); return table_schema; } }).withCreateDisposition(CreateDisposition.CREATE_NEVER) .withWriteDisposition(WriteDisposition.WRITE_APPEND) .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors()) ); 
p.run().waitUntilFinish(); log.info("Events Pipeline Job Stopped"); } }
(以下原为上方 Java 代码清单的机器翻译残留:`import` 被译成"导入"、`PipelineOptionsFactory` 变成"pipelineoptions工厂"、`Event_ordering` 变成"Event_排序"等,且在 `ParseJsonData_bigquery` 类中途截断。其内容与上方清单完全重复,正确代码请参见上方的 `StarterPipeline` 完整清单。)