Python 读取一个csv文件，清理它，然后使用ApacheBeamDataflow将结果作为csv写出_Python_Python 3.x_Google Cloud Platform_Google Cloud Dataflow_Apache Beam

Python：读取一个 CSV 文件，清理它，然后使用 Apache Beam（Dataflow）将结果作为 CSV 写出

python python-3.x google-cloud-platform google-cloud-dataflow

Python 读取一个csv文件，清理它，然后使用ApacheBeamDataflow将结果作为csv写出,python,python-3.x,google-cloud-platform,google-cloud-dataflow,apache-beam,Python,Python 3.x,Google Cloud Platform,Google Cloud Dataflow,Apache Beam,我想读取一个csv文件，清理它，然后使用ApacheBeamDataflow将结果写成csv。其目的是使文件可加载到BigQuery中。清理规则是简单地用双引号转义双引号。我的清洁规则有效。我很难将它整合到管道中。我正在寻求关于我的清洗功能应该返回什么以及如何通过管道调用它的建议 import apache_beam as beam import csv import logging from apache_beam.options.pipeline_options import Pipeli

我想读取一个 CSV 文件，对其进行清理，然后使用 Apache Beam（Dataflow）将结果写成 CSV，目的是让该文件能够加载到 BigQuery 中。清理规则很简单：用两个双引号来转义字段中的双引号。我的清理规则本身是有效的，但我很难把它整合到管道中。我想请教：我的清理函数应该返回什么，以及如何在管道中调用它。

import apache_beam as beam
import csv
import logging
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io import WriteToText

def parse_method(line):
    """Re-encode one raw CSV line so embedded double quotes are doubled.

    The line is parsed with the csv module (accepting both ``""`` and
    backslash-escaped quotes on input) and written back out as a single
    CSV record in which a literal double quote inside a field is
    represented as two double quotes.

    Args:
        line: One text line of the input file, without its line terminator.

    Returns:
        The cleaned CSV record as a string with no trailing newline.
        An empty input line yields an empty string.
    """
    # Local import: io is not among the file's top-level imports.
    import io

    CSV_PARSING_KWARGS = {
        'doublequote': True,
        'escapechar': '\\',
        'quotechar': '"',
        'delimiter': ','
    }

    # csv.reader expects an iterable of lines, so wrap the single line in a
    # list; the options must be passed as keyword arguments, not positionally.
    reader = csv.reader([line], **CSV_PARSING_KWARGS)
    fields = next(reader, [])

    # Serialise into an in-memory buffer instead of a file handle so the
    # function is a pure str -> str transform usable with beam.Map.
    buffer = io.StringIO()
    writer = csv.writer(
        buffer,
        quotechar='"',
        doublequote=True,
        quoting=csv.QUOTE_MINIMAL,
        lineterminator='',  # the text sink adds the line terminator itself
    )
    writer.writerow(fields)
    return buffer.getvalue()


def run(region, project, bucket, temploc):
    """Build the CSV cleaning pipeline and start it with the Dataflow runner.

    Args:
        region: Region passed to the runner as ``--region``.
        project: Project ID passed as ``--project``.
        bucket: Bucket name used to build the staging location.
        temploc: Value passed as ``--temp_location``.

    The function calls ``pipeline.run()`` and returns without waiting on the
    returned result object.
    """
    argv = [
        # Passed in args
        '--region={}'.format(region),
        '--project={}'.format(project),
        '--temp_location={}'.format(temploc),
        # Constructs
        '--staging_location=gs://{}/clean_input/stg/'.format(bucket),
        # Mandatory constants
        '--job_name=cleammycsv',
        '--runner=DataflowRunner',
    ]

    options = PipelineOptions(flags=argv)
    pipeline = beam.Pipeline(options=options)

    # The read step has to be applied to this pipeline object; each stage is
    # chained with `|`, and the cleaning function is applied per line via Map.
    (
        pipeline
        | 'Read' >> beam.io.ReadFromText(
            file_pattern="gs://dev/clean_input/input01.csv")
        | 'Clean' >> beam.Map(parse_method)
        # WriteToText takes an output path prefix (file_path_prefix);
        # it has no file_pattern parameter.
        | 'Output to file' >> WriteToText(
            file_path_prefix="gs://dev/clean_output/output_file.csv")
    )
    pipeline.run()

if __name__ == '__main__':
    import argparse

    # Command-line interface: region is optional, the other three are required.
    cli = argparse.ArgumentParser(description='Run the CSV cleaning pipeline')
    cli.add_argument(
        '-r', '--region',
        help='Region ID where data flow job to run',
        default='australia-southeast1',
    )
    cli.add_argument('-p', '--project', help='Unique project ID', required=True)
    cli.add_argument('-b', '--bucket', help='Bucket name', required=True)
    cli.add_argument('-t', '--temploc', help='Bucket name and folder', required=True)

    # The option names match run()'s parameter names, so the parsed
    # namespace can be forwarded directly as keyword arguments.
    run(**vars(cli.parse_args()))

我终于找到了一种可行的方法：

import apache_beam as beam
import csv
import logging
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io import WriteToText


def parse_file(element):
    """Return one CSV line with every field quoted and inner quotes removed.

    The line is parsed with the csv module, every double-quote character
    left inside a field is dropped, and the fields are re-joined as
    ``"f1","f2",...``. Only the first parsed row is used.
    """
    rows = csv.reader([element], quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL)
    first_row = next(rows, None)
    if first_row is None:
        # Mirrors falling out of a loop that never ran.
        return None

    stripped_fields = (field.replace('"', '') for field in first_row)
    return '"{}"'.format('","'.join(stripped_fields))



def run(region, project, bucket, temploc ):
    """Assemble the read -> clean -> write pipeline and start it.

    Input and output locations are derived from ``bucket``; ``region``,
    ``project`` and ``temploc`` are forwarded to the runner as pipeline
    flags. The pipeline is started with ``pipeline.run()`` and the
    function returns nothing.
    """
    filename_in = f'gs://{bucket}/clean_input/IN_FILE.csv'
    files_output = f'gs://{bucket}/clean_output/OUT_FILE.csv'

    runner_flags = [
        # Values supplied by the caller
        f'--region={region}',
        f'--project={project}',
        f'--temp_location={temploc}',
        # Derived from the bucket name
        f'--staging_location=gs://{bucket}/clean_input/stg/',
        # Fixed settings
        '--job_name=cleammycsv',
        '--runner=DataflowRunner',
    ]
    pipeline = beam.Pipeline(options=PipelineOptions(flags=runner_flags))

    # Same three stages as a single chained expression, spelled out step by step.
    raw_lines = pipeline | 'Read input file' >> beam.io.ReadFromText(filename_in)
    cleaned_lines = raw_lines | 'Parse file' >> beam.Map(parse_file)
    cleaned_lines | 'writecsv' >> beam.io.WriteToText(files_output, num_shards=10)

    pipeline.run()

if __name__ == '__main__':
    import argparse

    # All four options are mandatory for this version of the script.
    cli = argparse.ArgumentParser(description='Run the CSV cleaning pipeline')
    cli.add_argument('-r', '--region', help='Region ID where data flow job to run', required=True)
    cli.add_argument('-p', '--project', help='Unique project ID', required=True)
    cli.add_argument('-b', '--bucket', help='Bucket name', required=True)
    cli.add_argument('-t', '--temploc', help='Bucket name and folder', required=True)

    parsed = cli.parse_args()
    run(
        region=parsed.region,
        project=parsed.project,
        bucket=parsed.bucket,
        temploc=parsed.temploc,
    )

当前的 `parse_method` 函数返回什么？parse 函数会写出一个文件。如果数据字段中含有双引号，它会再添加一个双引号，并给该字段加上引号，例如 "mystring""data"。在加载到 BigQuery 之前，您是否只需要删除双引号？