读取一个 CSV 文件，对其进行清理，然后使用 Apache Beam（Dataflow）将结果写出为 CSV 文件

import apache_beam as beam
import csv
import io
import logging
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText

# Dialect options used when parsing each raw input line.
CSV_PARSING_KWARGS = {
    'doublequote': True,
    'escapechar': '\\',
    'quotechar': '"',
    'delimiter': ','
}


def parse_method(line):
    """Parse one raw CSV line and return it re-serialized as a clean CSV line.

    Args:
        line: A single line of text from the input file, without its
            trailing newline.

    Returns:
        The record written back out with minimal quoting and no line
        terminator (the text sink adds the newline). An empty string is
        returned when the line yields no record.
    """
    # csv.reader expects an iterable of lines, so wrap the single line.
    record = next(csv.reader([line], **CSV_PARSING_KWARGS), [])
    buffer = io.StringIO()
    writer = csv.writer(buffer, quoting=csv.QUOTE_MINIMAL, lineterminator='')
    writer.writerow(record)
    return buffer.getvalue()


def run(region, project, bucket, temploc):
    """Build the CSV cleaning pipeline and run it with the Dataflow runner.

    Args:
        region: Region ID passed as the ``--region`` option.
        project: Project ID passed as the ``--project`` option.
        bucket: Bucket name used to build the staging location.
        temploc: Value passed as the ``--temp_location`` option.
    """
    argv = [
        # Passed in args
        '--region={}'.format(region),
        '--project={}'.format(project),
        '--temp_location={}'.format(temploc),
        # Constructs
        '--staging_location=gs://{}/clean_input/stg/'.format(bucket),
        # Mandatory constants
        '--job_name=cleammycsv',
        '--runner=DataflowRunner'
    ]
    options = PipelineOptions(flags=argv)
    pipeline = beam.Pipeline(options=options)

    # The read step must be part of the pipeline itself; it cannot be applied
    # at module level before the pipeline object exists.
    _ = (
        pipeline
        | 'Read' >> ReadFromText(file_pattern="gs://dev/clean_input/input01.csv")
        | 'Clean' >> beam.Map(parse_method)
        | 'Output to file' >> WriteToText(
            file_path_prefix="gs://dev/clean_output/output_file.csv")
    )

    pipeline.run()


if __name__ == '__main__':
    import argparse

    # Create the parser
    parser = argparse.ArgumentParser(description='Run the CSV cleaning pipeline')
    parser.add_argument('-r', '--region',
                        help='Region ID where data flow job to run',
                        default='australia-southeast1')
    parser.add_argument('-p', '--project', help='Unique project ID', required=True)
    parser.add_argument('-b', '--bucket', help='Bucket name', required=True)
    parser.add_argument('-t', '--temploc', help='Bucket name and folder', required=True)

    # Execute the parse_args() method
    args = vars(parser.parse_args())

    run(project=args['project'], bucket=args['bucket'],
        region=args['region'], temploc=args['temploc'])

1条回答

网友

1楼 · 发布于 2024-04-20 04:44:17

我终于找到了一种能起作用的方法

import apache_beam as beam
import csv
import logging
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io import WriteToText


def parse_file(element):
    """Return one CSV line re-quoted so every field is wrapped in double quotes.

    The line is parsed with the csv module, every double-quote character left
    inside a field is dropped, and the fields are joined back together as
    ``"f1","f2",...``.

    Args:
        element: A single line of CSV text.

    Returns:
        The cleaned line, or None if the reader produces no record.
    """
    reader = csv.reader([element], quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL)
    record = next(reader, None)
    if record is None:
        return None
    stripped_fields = (field.replace('"', '') for field in record)
    return '"' + '","'.join(stripped_fields) + '"'



def run(region, project, bucket, temploc):
    """Build the CSV cleaning pipeline and run it with the Dataflow runner.

    The pipeline reads ``gs://<bucket>/clean_input/IN_FILE.csv``, maps every
    line through ``parse_file`` and writes the results in 10 shards under the
    prefix ``gs://<bucket>/clean_output/OUT_FILE.csv``. The object returned by
    ``pipeline.run()`` is neither awaited nor returned.

    Args:
        region: Region ID passed as the ``--region`` option.
        project: Project ID passed as the ``--project`` option.
        bucket: Bucket name used for the staging, input and output paths.
        temploc: Value passed as the ``--temp_location`` option.
    """
    # Each flag needs the "--" prefix; without it the pipeline options parser
    # does not recognise the entry as an option.
    argv = [
        # Passed in args
        '--region={}'.format(region),
        '--project={}'.format(project),
        '--temp_location={}'.format(temploc),
        # Constructs
        '--staging_location=gs://{}/clean_input/stg/'.format(bucket),
        # Mandatory constants
        '--job_name=cleammycsv',
        '--runner=DataflowRunner',
    ]
    filename_in = 'gs://{}/clean_input/IN_FILE.csv'.format(bucket)
    files_output = 'gs://{}/clean_output/OUT_FILE.csv'.format(bucket)

    options = PipelineOptions(flags=argv)
    pipeline = beam.Pipeline(options=options)

    _ = (
        pipeline
        | 'Read input file' >> beam.io.ReadFromText(filename_in)
        | 'Parse file' >> beam.Map(parse_file)
        | 'writecsv' >> beam.io.WriteToText(files_output, num_shards=10)
    )

    pipeline.run()

if __name__ == '__main__':
   import argparse
   
   # Create the parser  
   parser = argparse.ArgumentParser(description='Run the CSV cleaning pipeline')   

   parser.add_argument('-r',' region', help='Region ID where data flow job to run', required=True)
   parser.add_argument('-p',' project', help='Unique project ID', required=True)
   parser.add_argument('-b',' bucket', help='Bucket name', required=True)
   parser.add_argument('-t',' temploc', help='Bucket name and folder', required=True)
   
   # Execute the parse_args() method
   args = vars(parser.parse_args())

   run(project=args['project'], bucket=args['bucket'], region=args['region'],temploc=args['temploc'])

相关问题更多 >

编程相关推荐

热门问题

热门文章