我想读取一个 CSV 文件，对其进行清理，然后使用 Apache Beam（Dataflow）将结果写成 CSV 文件，目的是使该文件可以加载到 BigQuery 中。清理规则很简单：用双引号来转义双引号。我的清理规则本身是有效的，但我很难将它整合到管道中。我想请教：我的清理函数应该返回什么，以及如何在管道中调用它？

import apache_beam as beam
import csv
import logging
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io import WriteToText

# NOTE(review): this statement executes at import time, but neither `p` nor
# `ReadFromText` is defined or imported in the visible code above (only
# `WriteToText` is imported). Presumably `p` is meant to be a beam.Pipeline
# and `ReadFromText` is apache_beam.io.ReadFromText -- confirm before use.
lines = p | ReadFromText(file_pattern="gs://dev/clean_input/input01.csv")

def parse_method(line):
    """Clean one CSV line so embedded double quotes are escaped by doubling.

    The line is parsed with a tolerant reader (accepting both ``""`` and
    ``\\"`` as an escaped quote inside a quoted field) and then re-serialised
    with the standard doubled-quote convention.

    Args:
        line: A single CSV record as text, without its trailing newline.

    Returns:
        The cleaned record as one CSV-formatted string with no line
        terminator, suitable for a text sink that writes one element per line.
        An empty input line yields an empty string.
    """
    # Imported locally so the function carries its own dependencies when it is
    # shipped to a worker process.
    import csv
    import io

    parsing_kwargs = {
        'doublequote': True,
        'escapechar': '\\',
        'quotechar': '"',
        'delimiter': ',',
    }

    # A one-element list makes csv.reader treat the string as a single record.
    fields = next(csv.reader([line], **parsing_kwargs), [])

    buffer = io.StringIO()
    writer = csv.writer(
        buffer,
        quoting=csv.QUOTE_MINIMAL,
        doublequote=True,
        lineterminator='',  # the sink appends the newline itself
    )
    writer.writerow(fields)
    return buffer.getvalue()

def run(region, project, bucket, temploc):
    """Build and launch the CSV-cleaning pipeline on the Dataflow runner.

    Reads the input file line by line, passes each line through
    ``parse_method`` and writes the cleaned lines back out as text.

    Args:
        region: Region ID where the Dataflow job should run.
        project: Project ID the job is submitted under.
        bucket: Bucket name used to build the staging location.
        temploc: Temp location (bucket name and folder) for the job.
    """
    argv = [
        # Passed in args
        '--region={}'.format(region),
        '--project={}'.format(project),
        '--temp_location={}'.format(temploc),
        # Constructs
        '--staging_location=gs://{}/clean_input/stg/'.format(bucket),
        # Mandatory constants
        '--job_name=cleammycsv',
        '--runner=DataflowRunner',
    ]
    options = PipelineOptions(flags=argv)

    pipeline = beam.Pipeline(options=options)
    # The read step is attached to the pipeline created here; a collection
    # built from some other pipeline object cannot be mixed into this one.
    (
        pipeline
        | 'Read' >> beam.io.ReadFromText(file_pattern="gs://dev/clean_input/input01.csv")
        | 'Clean' >> beam.Map(parse_method)
        # WriteToText takes `file_path_prefix`; it has no `file_pattern` argument.
        | 'Output to file' >> WriteToText(file_path_prefix="gs://dev/clean_output/output_file.csv")
    )
    # Nothing executes until the pipeline is explicitly run.
    pipeline.run()

if __name__ == '__main__':
    # Command-line entry point: collect the job settings and start the pipeline.
    import argparse

    cli = argparse.ArgumentParser(description='Run the CSV cleaning pipeline')
    cli.add_argument(
        '-r', '--region',
        default='australia-southeast1',
        help='Region ID where data flow job to run',
    )
    cli.add_argument('-p', '--project', required=True, help='Unique project ID')
    cli.add_argument('-b', '--bucket', required=True, help='Bucket name')
    cli.add_argument('-t', '--temploc', required=True, help='Bucket name and folder')

    parsed = cli.parse_args()

    run(
        region=parsed.region,
        project=parsed.project,
        bucket=parsed.bucket,
        temploc=parsed.temploc,
    )

import apache_beam as beam
import csv
import logging
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io import WriteToText

def parse_file(element):
    """Return one CSV line rewritten with every field wrapped in double quotes.

    The line is parsed as a single CSV record. Any double-quote characters
    still present inside a parsed field are removed (not escaped), and the
    fields are joined back together as ``"f1","f2",...``.

    Args:
        element: One line of CSV text.

    Returns:
        The re-quoted line as a string, or None if the reader yields no record.
    """
    records = csv.reader(
        [element], quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL
    )
    fields = next(records, None)
    if fields is None:
        return None
    stripped = (field.replace('"', '') for field in fields)
    return '"{}"'.format('","'.join(stripped))

def run(region, project, bucket, temploc):
    """Build and launch the CSV-cleaning pipeline on the Dataflow runner.

    Reads ``clean_input/IN_FILE.csv`` from the given bucket, passes every line
    through ``parse_file`` and writes the result under
    ``clean_output/OUT_FILE.csv`` in the same bucket, split into 10 shards.

    Args:
        region: Region ID where the Dataflow job should run.
        project: Project ID the job is submitted under.
        bucket: Bucket name used for the input, output and staging paths.
        temploc: Temp location (bucket name and folder) for the job.
    """
    argv = [
        # Passed in args
        '--region={}'.format(region),
        '--project={}'.format(project),
        '--temp_location={}'.format(temploc),
        # Constructs
        '--staging_location=gs://{}/clean_input/stg/'.format(bucket),
        # Mandatory constants
        '--job_name=cleammycsv',
        '--runner=DataflowRunner',
    ]
    filename_in = 'gs://{}/clean_input/IN_FILE.csv'.format(bucket)
    files_output = 'gs://{}/clean_output/OUT_FILE.csv'.format(bucket)
    options = PipelineOptions(flags=argv)

    pipeline = beam.Pipeline(options=options)

    (
        pipeline
        | 'Read input file' >> beam.io.ReadFromText(filename_in)
        | 'Parse file' >> beam.Map(parse_file)
        | 'writecsv' >> beam.io.WriteToText(files_output, num_shards=10)
    )
    # Nothing executes until the pipeline is explicitly run.
    pipeline.run()

if __name__ == '__main__':
    # Command-line entry point: collect the job settings and start the pipeline.
    import argparse

    # Create the parser
    parser = argparse.ArgumentParser(description='Run the CSV cleaning pipeline')

    # Long option names must start with '-'; argparse rejects an option string
    # such as ' region' with a ValueError.
    parser.add_argument('-r', '--region', help='Region ID where data flow job to run', required=True)
    parser.add_argument('-p', '--project', help='Unique project ID', required=True)
    parser.add_argument('-b', '--bucket', help='Bucket name', required=True)
    parser.add_argument('-t', '--temploc', help='Bucket name and folder', required=True)

    # Execute the parse_args() method
    args = vars(parser.parse_args())

    run(project=args['project'], bucket=args['bucket'], region=args['region'], temploc=args['temploc'])

