使用Python根据公共字段合并多个*.csv、*.txt或*.ascii文件
我想把大约8个*.csv文件合并成一个。
一个示例文件是:
ID, Average
34, 4.5
35, 5.6
36, 3.4
另一个文件可能是:
ID, Max
34, 6
35, 7
36, 4
我需要的输出结果是:
ID, Average, Max
34, 4.5, 6
35, 5.6, 7
36, 3.4, 4
这只部分有效……它把所有数据都放到了同样的两列里。
import glob, string
outfile = open('<directory>/<fileName>.csv','a')
files = glob.glob(r"<directory>/*.csv")
for y in files:
newfile = open(y,'r+')
data = newfile.read()
newfile.close()
outfile.writerow(y)
我该怎么做才能把数据放到新的列里,而不是重复“ID”字段呢?
4 个回答
0
也许可以试试txtselect这个工具?我没用过,但作者下个月会在pyArkansas上做一个讲座。
1
data1 = ['1,blue,red',
'2,purple,yellow',
'3,white,brown']
data2 = ['1,fee',
'2,fie',
'3,foe',
'4,fum']
data_table = dict(s.split(',',1) for s in data1)
for line in data2:
key, _ = line.split(',',1)
print ','.join((line, data_table.get(key,',')))
给出的结果是:
1,fee,blue,red
2,fie,purple,yellow
3,foe,white,brown
4,fum,,
这里有一个csv格式的版本:
import csv
data1 = ['1,blue,red',
'2,purple,yellow',
'3,white,brown']
data2 = ['1,fee',
'2,fie',
'3,foe',
'4,fum']
with open('out.txt','w') as f:
combined = csv.writer(f)
data1 = ['1,blue,red',
'2,purple,yellow',
'3,white,brown']
data2 = ['1,fee',
'2,fie',
'3,foe',
'4,fum']
data_table = dict((row[0], row[1:]) for row in csv.reader(data1))
for row in csv.reader(data2):
key = row[0]
combined.writerow(row + data_table.get(key, ['','']))
4
你这里有三个问题。
- 读取每一个csv文件
- 在一个共同的字段上合并数据
- 把合并后的数据写入一个新的csv文件
代码
#!/usr/bin/env python
import argparse, csv
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='merge csv files on field', version='%(prog)s 1.0')
parser.add_argument('infile', nargs='+', type=str, help='list of input files')
parser.add_argument('--out', type=str, default='temp.csv', help='name of output file')
args = parser.parse_args()
data = {}
fields = []
for fname in args.infile:
with open(fname, 'rb') as df:
reader = csv.DictReader(df)
for line in reader:
# assuming the field is called ID
if line['ID'] not in data:
data[line['ID']] = line
else:
for k,v in line.iteritems():
if k not in data[line['ID']]:
data[line['ID']][k] = v
for k in line.iterkeys():
if k not in fields:
fields.append(k)
del reader
writer = csv.DictWriter(open(args.out, "wb"), fields, dialect='excel')
# write the header at the top of the file
writer.writeheader()
writer.writerows(data)
del writer
请注意,这样做会忽略那些字段名完全相同的数据。
解析器部分的一个替代方案是:
#!/usr/bin/env python
import glob, csv
if __name__ == '__main__':
infiles = glob.glob('./*.csv')
out = 'temp.csv'
data = {}
fields = []
for fname in infiles:
df = open(fname, 'rb')
reader = csv.DictReader(df)
for line in reader:
# assuming the field is called ID
if line['ID'] not in data:
data[line['ID']] = line
else:
for k,v in line.iteritems():
if k not in data[line['ID']]:
data[line['ID']][k] = v
for k in line.iterkeys():
if k not in fields:
fields.append(k)
del reader
df.close()
writer = csv.DictWriter(open(out, "wb"), fields, dialect='excel')
# write the header at the top of the file
writer.writeheader()
writer.writerows(data)
del writer