Google BigQuery: latency from Dataflow into BigQuery and Storage is very high

Tags: google-bigquery, google-cloud-storage, google-cloud-dataflow

We are building a data pipeline on GCP and ran into some issues while testing it. Our current architecture runs on AWS, and for testing we are pushing a copy of the data in real time from Lambda to Pub/Sub.

  • We are facing latency issues going from Pub/Sub to BigQuery and to Storage via Dataflow. Is there a way to batch-load into BigQuery per table instead of inserting one event at a time? (See the sketch right after this list.) For Storage we use a 5-minute window; after 5 minutes we group the data by event key and write all events from that window into a single file. Can we do something similar for BigQuery, defining the schema once per event type rather than once per event?
  • Autoscaling of the workers is not happening; the minimum is 2 and the maximum is 10.
  • All services used are in asia-northeast1.
  • We typically receive about 3 million records per day. What would be the optimal worker configuration for Dataflow at this volume? (See the options sketch after the pipeline code.)
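
For the BigQuery side, what we have in mind for per-table batch loading is roughly the sketch below. This is an illustration only, not our running code: it assumes a Beam 2.x SDK where BigQueryIO's Method.FILE_LOADS is supported for unbounded input, and BatchLoadSketch, writeWithBatchLoads, rows and destinations are hypothetical names standing in for the output of ReadEventJson_bigquery and the DynamicDestinations used in the pipeline code further down.

    // Hypothetical sketch: write rows with periodic BigQuery load jobs instead of
    // streaming inserts, so each table gets bulk loads rather than per-event inserts.
    import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
    import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
    import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
    import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinations;
    import org.apache.beam.sdk.values.PCollection;
    import com.google.api.services.bigquery.model.TableRow;
    import org.joda.time.Duration;

    public class BatchLoadSketch {
        // rows would be the output of ReadEventJson_bigquery; destinations the same
        // DynamicDestinations instance used in the pipeline code further down.
        static void writeWithBatchLoads(PCollection<TableRow> rows,
                DynamicDestinations<TableRow, String> destinations) {
            rows.apply("Write_events_to_BQ_batched",
                BigQueryIO.writeTableRows()
                    .to(destinations)
                    // Stage rows to files and run one load job per table per trigger,
                    // instead of streaming events into BigQuery one at a time.
                    .withMethod(BigQueryIO.Write.Method.FILE_LOADS)
                    // Matches the existing 5-minute grouping window.
                    .withTriggeringFrequency(Duration.standardMinutes(5))
                    // A shard count is required when a triggering frequency is set.
                    .withNumFileShards(1)
                    .withCreateDisposition(CreateDisposition.CREATE_NEVER)
                    .withWriteDisposition(WriteDisposition.WRITE_APPEND));
        }
    }

With FILE_LOADS the rows are written per destination table in bulk, which seems closer to what we already do for Storage. Our full pipeline as it currently runs is below.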

    package purplle.datapipeline;
    import static java.nio.charset.StandardCharsets.UTF_8;
    
    import java.net.SocketTimeoutException;
    import java.time.LocalDateTime;
    import java.time.ZoneId;
    
    import org.apache.beam.sdk.Pipeline;
    import org.apache.beam.sdk.io.TextIO;
    import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers;
    import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
    import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
    import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
    import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinations;
    import org.apache.beam.sdk.io.gcp.bigquery.InsertRetryPolicy;
    import org.apache.beam.sdk.io.gcp.bigquery.TableDestination;
    import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
    import org.apache.beam.sdk.options.Default;
    import org.apache.beam.sdk.options.Description;
    import org.apache.beam.sdk.options.PipelineOptions;
    import org.apache.beam.sdk.options.PipelineOptionsFactory;
    import org.apache.beam.sdk.transforms.DoFn;
    import org.apache.beam.sdk.transforms.GroupByKey;
    import org.apache.beam.sdk.transforms.MapElements;
    import org.apache.beam.sdk.transforms.PTransform;
    import org.apache.beam.sdk.transforms.ParDo;
    import org.apache.beam.sdk.transforms.SimpleFunction;
    import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime;
    import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
    import org.apache.beam.sdk.transforms.windowing.Repeatedly;
    import org.apache.beam.sdk.transforms.windowing.Window;
    import org.apache.beam.sdk.values.KV;
    import org.apache.beam.sdk.values.PCollection;
    import org.apache.beam.sdk.values.ValueInSingleWindow;
    import org.joda.time.Duration;
    import org.json.JSONException;
    import org.json.JSONObject;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    
    import com.google.api.services.bigquery.Bigquery;
    import com.google.api.services.bigquery.model.TableRow;
    import com.google.api.services.bigquery.model.TableSchema;
    import com.google.cloud.storage.Blob;
    import com.google.cloud.storage.BlobId;
    import com.google.cloud.storage.BlobInfo;
    import com.google.cloud.storage.Storage;
    import com.google.cloud.storage.StorageOptions;
    
    import purplle.datapipeline.buisness.EventSchemaBuilder;
    import purplle.datapipeline.buisness.Ordering;
    import purplle.datapipeline.common.Constants;
    import purplle.datapipeline.helpers.Event_ordering;
    import purplle.datapipeline.helpers.Event_schema;
    import purplle.datapipeline.helpers.JSON_helper;
    
    public class StarterPipeline {
    
    
    public interface StarterPipelineOption extends PipelineOptions {
    
        /**
         * Set this required option to specify where to read the input.
         */
        @Description("Path of the file to read from")
        @Default.String(Constants.pubsub_event_pipeline_url)
        String getInputFile();
    
        void setInputFile(String value);
    
    }
    
    @SuppressWarnings("serial")
    static class ParseJsonData_storage extends DoFn<String, KV<String, String>> {
    
        @ProcessElement
        public void processElement(ProcessContext c) throws JSONException {
            Logger log = LoggerFactory.getLogger(StarterPipeline.class);
    
            if (c.element().length() > 0 && JSON_helper.isJSONValid(c.element())) {
                JSONObject event_obj = new JSONObject(c.element());
                if (event_obj.length() > 0 && event_obj.has("event")) {
                    JSONObject ob2 = JSON_helper.flatJsonConvertKeyToLower(event_obj);
                    if (ob2.length() > 0 && ob2.has("event")) {
                        // Reorder the json object, then pass it on to create the pipe-separated string.
                        KV<String, String> event_kv_pair = Event_ordering.order_event_columns(ob2, "storage");
                        if (!event_kv_pair.getKey().isEmpty() && event_kv_pair.getKey().length() > 0) {
                            c.output(event_kv_pair);
                        } else {
                            log = LoggerFactory.getLogger(StarterPipeline.class);
                            log.error("Storage string empty = " + c.element());
                        }
                    } else {
                        log = LoggerFactory.getLogger(StarterPipeline.class);
                        log.error("Storage object error = " + c.element());
                    }
                } else {
                    log = LoggerFactory.getLogger(StarterPipeline.class);
                    log.error("Storage object error = " + c.element());
                }
            } else {
                log = LoggerFactory.getLogger(StarterPipeline.class);
                log.error("Storage empty element = " + c.element());
            }
        }
    }
    
    @SuppressWarnings("serial")
    static class ParseJsonData_bigquery extends DoFn<String, TableRow> {
        @ProcessElement
        public void processElement(ProcessContext c) throws JSONException {
            Logger log = LoggerFactory.getLogger(StarterPipeline.class);
            log.info("Event json = " + c.element());
            if (!c.element().isEmpty() && JSON_helper.isJSONValid(c.element())) {
                JSONObject event_obj = new JSONObject(c.element());
                if (event_obj.length() > 0 && event_obj.has("event")) {
                    JSONObject ob2 = JSON_helper.flatJsonConvertKeyToLower(event_obj);
                    if (ob2.length() > 0 && ob2.has("event")) {
                        TableRow event_row = EventSchemaBuilder.get_event_row(ob2, "bigquery");
                        if (!event_row.isEmpty()) {
                            c.output(event_row);
                        } else {
                            log = LoggerFactory.getLogger(StarterPipeline.class);
                            log.error("Bigquery set event ordering schema error = " + c.element());
                        }
                    } else {
                        log = LoggerFactory.getLogger(StarterPipeline.class);
                        log.error("Bigquery set event ordering object error = " + c.element());
                    }
                } else {
                    log = LoggerFactory.getLogger(StarterPipeline.class);
                    log.error("Bigquery event item object error = " + c.element());
                }
            } else {
                log = LoggerFactory.getLogger(StarterPipeline.class);
                log.error("Bigquery event item error = " + c.element());
            }
        }
    }
    
    @SuppressWarnings("serial")
    static class Write_to_GCS extends DoFn<KV<String, String>, TextIO.Write> {
        @ProcessElement
        public void processElement(ProcessContext c) throws JSONException {
    
            String event_string = c.element().getValue();
            String event_name = c.element().getKey();
    
            LocalDateTime now = LocalDateTime.now(ZoneId.of("Asia/Kolkata"));
            int year = now.getYear();
            int month = now.getMonthValue();
            int day = now.getDayOfMonth();
            int hour = now.getHour();
            int minute = now.getMinute();
            int second = now.getSecond();
    
            String storage_file_path = event_name + "/" + year + "/" + month + "/" + day + "/" + hour + "/" + event_name
            + "-" + year + "-" + month + "-" + day + "-" + hour + "-" + minute + "-" + second + ".txt";
    
            Logger log = LoggerFactory.getLogger(StarterPipeline.class);
            log.info("Writing file to location = " + storage_file_path);
    
            // Create your service object
            Storage storage = StorageOptions.getDefaultInstance().getService();
    
            // Upload a blob to the newly created bucket
            BlobId blobId = BlobId.of(Constants.gcp_events_bucket_name, storage_file_path);
            BlobInfo blobInfo = BlobInfo.newBuilder(blobId).setContentType("text/plain").build();
            @SuppressWarnings("unused")
            Blob blob = storage.create(blobInfo, event_string.getBytes(UTF_8));
    
        }
    }
    
    @SuppressWarnings("serial")
    public static class ReadEventJson_storage extends PTransform<PCollection<String>, PCollection<KV<String, String>>> {
        @Override
        public PCollection<KV<String, String>> expand(PCollection<String> lines) {
    
            Logger log = LoggerFactory.getLogger(StarterPipeline.class);
            log.info("Storage workflow started");
    
            @SuppressWarnings("unused")
            Boolean tempbool = Event_ordering.setEventsOrdering();
            // Parse each JSON event string into an (event name, pipe-separated row) key/value pair.
            PCollection<KV<String, String>> words = lines.apply(ParDo.of(new ParseJsonData_storage()));
    
            return words;
        }
    }
    
    @SuppressWarnings("serial")
    public static class ReadEventJson_bigquery extends PTransform<PCollection<String>, PCollection<TableRow>> {
        @Override
        public PCollection<TableRow> expand(PCollection<String> lines) {
    
            Logger log = LoggerFactory.getLogger(StarterPipeline.class);
            log.info("Bigquery workflow started");
    
            @SuppressWarnings("unused")
            Boolean tempbool = Event_ordering.setEventsOrdering();
    
            log.info("Bigquery get event ordering");
            Ordering events_ordering = Event_ordering.getEventsOrdering();
    
            Event_schema es = new Event_schema();
            es.setEventSchema(events_ordering);
    
            // Parse each JSON event string into a BigQuery TableRow.
            PCollection<TableRow> table_row = lines.apply(ParDo.of(new ParseJsonData_bigquery()));
    
            log.info("Bigquery workflow rows prepared");
    
            return table_row;
        }
    }
    
    /** A SimpleFunction that concatenates the grouped event strings for a key into one newline-separated blob. */
    @SuppressWarnings("serial")
    public static class CombineEventStrings extends SimpleFunction<KV<String, Iterable<String>>, KV<String, String>> {
    
        @Override
        public KV<String, String> apply(KV<String, Iterable<String>> input) {
    
            String combined_event = "";
    
            for (String combined_str : input.getValue()) {
                combined_event += combined_str + "\n";
            }
    
            Logger log = LoggerFactory.getLogger(StarterPipeline.class);
            log.info("combined_event = " + combined_event);
    
            KV<String, String> return_kv = KV.of(input.getKey(), combined_event);
    
            return return_kv;
        }
    }
    
    @SuppressWarnings("serial")
    public static void main(String[] args) throws SocketTimeoutException {
    
        Logger log = LoggerFactory.getLogger(StarterPipeline.class);
    
        log.info("Events pipeline job started");
    
        StarterPipelineOption options = PipelineOptionsFactory.fromArgs(args).withValidation()
        .as(StarterPipelineOption.class);
    
        Pipeline p = Pipeline.create(options);
    
        log.info("Pipeline created");
    
        log.info("Pipeline Started");
    
        PCollection<String> datastream = p.apply("Read Events From Pubsub",
            PubsubIO.readStrings().fromSubscription(Constants.pubsub_event_pipeline_url));
    
        // PCollection<String> windowed_items =
        // datastream.apply(Window.<String>into(FixedWindows.of(Duration.standardMinutes(1))));
    
        // PCollection<String> windowed_items = datastream.apply(
        // Window.<String>into(SlidingWindows.of(Duration.standardMinutes(1)).every(Duration.standardSeconds(10))));
    
        PCollection<String> windowed_items = datastream.apply(Window.<String>into(new GlobalWindows())
            .triggering(Repeatedly.forever(
                AfterProcessingTime.pastFirstElementInPane().plusDelayOf(Duration.standardSeconds(300))))
            .withAllowedLateness(Duration.standardDays(10)).discardingFiredPanes());
    
        // Write to storage
        windowed_items.apply("Read and make pipe separated event string", new ReadEventJson_storage())
        .apply("Combine events by keys", GroupByKey.<String, String>create())
        .apply("Combine events strings by event name", MapElements.via(new CombineEventStrings()))
        .apply("Manually write events to GCS", ParDo.of(new Write_to_GCS()));
    
        // Write into Big Query
        windowed_items.apply("Read and make event table row", new ReadEventJson_bigquery())
    
        .apply("Write_events_to_BQ",
            BigQueryIO.writeTableRows().to(new DynamicDestinations<TableRow, String>() {
                public String getDestination(ValueInSingleWindow<TableRow> element) {
                    String destination = EventSchemaBuilder
                    .fetch_destination_based_on_event(element.getValue().get("event").toString());
                    return destination;
                }
    
                @Override
                public TableDestination getTable(String table) {
                    String destination = EventSchemaBuilder.fetch_table_name_based_on_event(table);
                    return new TableDestination(destination, destination);
                }
    
                @Override
                public TableSchema getSchema(String table) {
                    TableSchema table_schema = EventSchemaBuilder.fetch_table_schema_based_on_event(table);
                    return table_schema;
                }
            }).withCreateDisposition(CreateDisposition.CREATE_NEVER)
            .withWriteDisposition(WriteDisposition.WRITE_APPEND)
            .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
            );
    
        p.run().waitUntilFinish();
    
        log.info("Events Pipeline Job Stopped");
    
    }
    
    }
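
For worker sizing and autoscaling (currently minimum 2 / maximum 10, and no scaling is happening), the sketch below shows roughly the options we would set on the Dataflow runner. It is illustrative only: the setters come from DataflowPipelineOptions in the Beam Dataflow runner module and their exact names can differ between releases, and the machine type and worker counts are placeholders rather than a recommendation.

    // Hypothetical sketch of Dataflow runner options for worker sizing and autoscaling.
    // Assumes the beam-runners-google-cloud-dataflow-java dependency is on the classpath.
    import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
    import org.apache.beam.runners.dataflow.options.DataflowPipelineWorkerPoolOptions.AutoscalingAlgorithmType;
    import org.apache.beam.sdk.Pipeline;
    import org.apache.beam.sdk.options.PipelineOptionsFactory;

    public class DataflowOptionsSketch {
        public static Pipeline createPipeline(String[] args) {
            DataflowPipelineOptions options = PipelineOptionsFactory.fromArgs(args)
                    .withValidation().as(DataflowPipelineOptions.class);
            // Keep workers in the same region as Pub/Sub, GCS and BigQuery.
            options.setRegion("asia-northeast1");
            // Scale on throughput/backlog between the initial and maximum worker counts.
            options.setAutoscalingAlgorithm(AutoscalingAlgorithmType.THROUGHPUT_BASED);
            options.setNumWorkers(2);        // initial worker count
            options.setMaxNumWorkers(10);    // autoscaling ceiling
            // Placeholder machine type; ~3 million events per day is a modest load.
            options.setWorkerMachineType("n1-standard-2");
            return Pipeline.create(options);
        }
    }

Is THROUGHPUT_BASED autoscaling with bounds like these the right approach for this volume, or is there a better worker configuration we should be using?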
    