在Python中从多个文件中找到单词的最小值?
我有1000个 .txt 文件,我需要在这些文件上运行一段代码。我的任务是找到每个文件中最大的 ENSG 值,然后删除其他小于这个最大值的 ENSG 值。接着,我还要根据 utr 长度找到同一个文本文件中的最小值,并把它保存到我的 output.txt 文件里。我希望把这1000个 .txt 文件的结果都放到同一个 output.txt 文件中。现在这段代码大致能工作,但它会覆盖之前的结果,只显示最后一个文件的结果:
# NOTE(review): the indentation of this script was lost when it was pasted;
# the nesting below is reconstructed from the data flow and from the symptom
# described in the question -- confirm against the original script.
# The script uses Python 2 syntax (print statement, dict.values() as a list).
import glob
f2 = glob.glob("./*.txt")
# Accumulates the text of every file read so far; it is never reset between
# files, so later iterations still see the lines of earlier files.
all_text=""
for fpath in f2:
    f = open(fpath,"r")
    list_lines = f.readlines()
    # Maps the first whitespace-separated field of a line to the second one.
    # Scores stay strings, and a repeated name overwrites the earlier entry,
    # so only the last score seen for each name is retained.
    dic={}
    sent="ENSG"
    temp_list=[]
    for line in list_lines:
        all_text=all_text+line
        name= line.rsplit()[0].strip()
        score=line.rsplit()[1].strip()
        dic[name]=score
    # Collect the scores whose name contains the "ENSG" marker.
    for i in dic.keys():
        if sent in i:
            temp_list.append(dic[i])
    # The scores are strings, so max() compares them lexicographically.
    hiegh_score=max(temp_list)
    def check(index):
        # Looks backwards from `index` for the preceding newline and compares
        # the start of that line with `sent`. Returns False when the line
        # starts with `sent` (which ends the search loop below), True otherwise.
        reverse_text=all_text[index+1::-1]
        index2=reverse_text.find("\n")
        if sent==reverse_text[:index2+1][::-1][1:len(sent)+1]:
            return False
        else:
            return True
    # dict.values() is a list in Python 2, which is what allows .remove() below.
    list_to_min=dic.values()
    # For every ENSG score other than the maximum: locate an occurrence of the
    # score text for which check() returns False, cut that text (only the
    # score, not the whole line) out of all_text, and drop the score from the
    # candidates for the minimum.
    for i in temp_list:
        if i!=hiegh_score:
            index=all_text.find(str(i))
            while check(index):
                index=all_text.find(str(i),index+len(str(i)))
            all_text=all_text[0:index]+all_text[index+len(str(i)):]
            list_to_min.remove(str(i))
    # Mode "w" truncates my_try4.txt. If this runs once per input file (as
    # reconstructed here), each pass discards what the previous pass wrote,
    # which matches the "only the last file's result" symptom in the question.
    file2=open("my_try4.txt","w")
    file2.write(all_text)
    # String comparison again: this is the lexicographic minimum.
    min_score= min(list_to_min)
    for j in dic.keys():
        if min_score==dic[j]:
            k="min score is :"+str(min_score)+" for person "+j
            file2.write(k)
    print "%6d : %s" % (len(list_lines),fpath)
    file2.close()
    f.close()
我有这样的文本文件,比如 4.txt:
ENSBTAG00000020679 197
ENSCAFG00000009872 2585
ENSG00000018236 1935
ENSG00000018236 230
ENSG00000018236 257
ENSG00000018236 338
ENSG00000018236 922
ENSG00000018236 922
ENSRNOG00000004438 14
ENSRNOG00000004438 14
现在它应该选择ENSG值为1935的,并删除其他所有ENSG值。现在这个文本文件应该看起来像这样:
ENSBTAG00000020679 197
ENSCAFG00000009872 2585
ENSG00000018236 1935
ENSRNOG00000004438 14
ENSRNOG00000004438 14
然后,在这个处理后的文本文件中找到最小的值(本例中是14),并把它保存到一个文本文件里(我们要对1000个文件都这样做,最终的输出应该汇总在同一个文件中):
output.txt
textfile4 14
1 个回答
1
重写这个代码比找出你代码的问题要简单多了:
import os.path
import glob
import re
import itertools
from collections import namedtuple, deque
from operator import attrgetter
# Matches one whole data line: a run of upper-case letters (group "prefix"),
# a run of digits (group "suffix"), whitespace, then a run of digits (group
# "value"), with optional trailing whitespace. For example
# "ENSG00000018236 1935" gives prefix "ENSG", suffix "00000018236" and
# value "1935".
R_PREFIX_VALUE = re.compile(r'^(?P<prefix>[A-Z]+)(?P<suffix>\d+)\s+(?P<value>\d+)\s*$')
# Key function that returns the ``value`` attribute of the object it is given.
getvalue = attrgetter('value')
def interleave(seq, val):
    """Return a lazy iterator yielding each item of *seq* followed by *val*.

    For example ``interleave('ab', '-')`` yields ``'a', '-', 'b', '-'``.
    An empty *seq* yields nothing.
    """
    # itertools.izip exists only on Python 2. On Python 3 the built-in zip is
    # already lazy, so fall back to it and keep this helper usable on both.
    lazy_zip = getattr(itertools, 'izip', zip)
    return itertools.chain.from_iterable(lazy_zip(seq, itertools.repeat(val)))
class Fileline(namedtuple('Fileline', 'filename prefix suffix value')):
    """One parsed data line plus the name of the file it was read from.

    Fields:
        filename: whatever was passed to ``_fromstr`` (``None`` by default).
        prefix: the letter part of the identifier, e.g. ``'ENSG'``.
        suffix: the digit part of the identifier, kept as a string.
        value: the number at the end of the line, as an ``int``.
    """

    # Without this every instance would also carry its own __dict__; an empty
    # __slots__ keeps instances as small as plain tuples.
    __slots__ = ()

    @classmethod
    def _fromstr(cls, s, filename=None, rematch=R_PREFIX_VALUE.match):
        """Parse the text line *s* into a ``Fileline``.

        *rematch* is the matching callable applied to *s*; it must return an
        object with ``groupdict()`` giving ``prefix``, ``suffix`` and
        ``value``, or a false value when the line does not match.

        Raises:
            ValueError: if *rematch* does not match *s*.
        """
        m = rematch(s)
        if not m:
            # Wrap in a 1-tuple so the message is built correctly even if *s*
            # itself happens to be a tuple.
            raise ValueError('No valid line found in %r' % (s,))
        d = m.groupdict()
        d['value'] = int(d['value'])
        d['filename'] = filename
        return cls(**d)

    def _asstr(self):
        """Return the line as ``<prefix><suffix> <value>`` without a newline."""
        return '{}{} {}'.format(self.prefix, self.suffix, self.value)
def max_value_with_prefix(lineseq, prefix, getvalue=getvalue):
    """Return the line with the largest value among those with *prefix*.

    Only lines whose ``prefix`` attribute equals *prefix* are considered, and
    *getvalue* is the key function used to rank them. Returns ``None`` when no
    line has the prefix.
    """
    withprefix = (line for line in lineseq if line.prefix == prefix)
    # Call extreme_value directly so a caller-supplied key function is
    # honoured; going through max_value() would silently ignore it.
    return extreme_value(max, withprefix, getvalue)
def filter_lt_line(lineseq, maxline):
    """Yield the lines of *lineseq* that survive the "keep only the max" rule.

    A line is dropped only when it has the same ``prefix`` as *maxline* and a
    strictly smaller ``value``. Lines with other prefixes, and lines that tie
    with *maxline*, are yielded unchanged and in their original order.

    *maxline* may be ``None`` (no line carried the prefix of interest); in
    that case there is nothing to drop and every line is yielded.
    """
    if maxline is None:
        for line in lineseq:
            yield line
        return
    prefix, ceiling = maxline.prefix, maxline.value
    for line in lineseq:
        if line.prefix != prefix or line.value >= ceiling:
            yield line
def extreme_value(fn, lineseq, getvalue=getvalue):
    """Apply *fn* (e.g. ``max`` or ``min``) to *lineseq*, ranked by *getvalue*.

    ``None`` entries in *lineseq* are ignored. Returns ``None`` when the call
    raises ``ValueError``, as ``max``/``min`` do for an empty sequence.
    """
    candidates = (entry for entry in lineseq if entry is not None)
    try:
        result = fn(candidates, key=getvalue)
    except ValueError:
        result = None
    return result
def max_value(lineseq):
    """Return the entry of *lineseq* picked by ``extreme_value`` with ``max``."""
    return extreme_value(fn=max, lineseq=lineseq)
def min_value(lineseq):
    """Return the entry of *lineseq* picked by ``extreme_value`` with ``min``."""
    return extreme_value(fn=min, lineseq=lineseq)
def read_lines(fn, maker=Fileline._fromstr):
    """Read the file *fn* and return a deque of ``maker(line, fn)`` results.

    The file is opened in ``'rb'`` mode and *maker* is called once per line,
    in file order, with the line and the file name.
    """
    parsed = deque()
    with open(fn, 'rb') as handle:
        for raw in handle:
            parsed.append(maker(raw, fn))
    return parsed
def write_file(fn, lineseq):
    """Overwrite the file *fn* with the ``_asstr()`` text of each line.

    Every entry of *lineseq* is written followed by a newline; the file is
    opened in ``'wb'`` mode, so any previous content is discarded.
    """
    rendered = (entry._asstr() for entry in lineseq)
    terminated = interleave(rendered, '\n')
    with open(fn, 'wb') as out:
        out.writelines(terminated)
def write_output_file(fn, lineseq):
    """Overwrite *fn* with one ``<filename> <value>`` line per entry.

    ``None`` entries of *lineseq* are skipped. ``min_value`` returns ``None``
    for a file that produced no lines, and such a file has no result to
    report, so it should not abort the whole summary.
    """
    lines = ("{} {}".format(l.filename, l.value)
             for l in lineseq if l is not None)
    newlines = interleave(lines, "\n")
    with open(fn, 'wb') as f:
        f.writelines(newlines)
def filter_max_returning_min(fn, prefix):
    """Rewrite the file *fn* in place and return its minimum-value line.

    Lines that share *prefix* but have a value below the maximum for that
    prefix are removed, the remaining lines are written back to *fn*, and the
    line with the smallest value among them is returned.
    """
    parsed = read_lines(fn)
    top = max_value_with_prefix(parsed, prefix)
    # Materialise the survivors: they are needed twice, once for the rewrite
    # and once for the minimum.
    kept = deque(filter_lt_line(parsed, top))
    write_file(fn, kept)
    return min_value(kept)
def main(fileglob, prefix, outputfile):
    """Process every file matching *fileglob*, one at a time.

    Each file is filtered and rewritten by ``filter_max_returning_min``; the
    minimum line of each file is then written to *outputfile*.
    """
    minlines = [filter_max_returning_min(path, prefix)
                for path in glob.iglob(fileglob)]
    write_output_file(outputfile, minlines)
程序的入口是 main(),它的调用方式是 main('txtdir/*.txt', 'ENSG', 'output.txt')(注意第一个参数是一个 glob 模式,而不是目录名)。对于每个文件,filter_max_returning_min() 会打开并重写这个文件,同时返回值最小的那一行。你不需要保存你访问过的每个文件的每一行的字典或列表。
(顺便说一下,直接覆盖文件听起来不是个好主意!你有没有考虑把它们复制到别的地方?)
当你把不同的功能分开到不同的函数里时,重新组合它们以实现不同的执行效果就变得很简单。例如,通过添加两个小函数,你可以轻松地让这个任务在所有文件上并行运行:
def _worker(args):
    """Run ``filter_max_returning_min`` on one task tuple.

    *args* is unpacked into positional arguments; ``multi_main`` builds it as
    ``(fn, prefix)``.
    """
    task = args
    return filter_max_returning_min(*task)
def multi_main(fileglob, prefix, outputfile, processes):
    """Parallel counterpart of ``main``, using *processes* worker processes.

    Every file matching *fileglob* is handled by ``_worker`` in a process
    pool; the per-file minimum lines are written to *outputfile* in whatever
    order the workers finish.
    """
    from multiprocessing import Pool
    pool = Pool(processes=processes)
    try:
        workerargs = ((fn, prefix) for fn in glob.iglob(fileglob))
        # NOTE(review): the third positional argument of imap_unordered is
        # the chunk size, so files are handed out in batches of `processes`.
        # Kept as in the original; confirm this is intended.
        minlines = pool.imap_unordered(_worker, workerargs, processes)
        # Use the same "<filename> <value>" format as main(). write_file()
        # would emit the raw data line and lose the file name.
        write_output_file(outputfile, minlines)
    finally:
        # Results are consumed while writing, so the pool must stay alive
        # until here. Always shut it down so no worker processes are left
        # behind, including when a worker raised.
        pool.terminate()
        pool.join()
现在你可以启动一个可配置数量的工作进程,每个进程处理一个文件,完成后收集它们的最小值。如果你的文件非常大或者文件数量很多,并且不受输入输出限制,这样可能会更快。
为了好玩,你还可以轻松地把它变成一个命令行工具:
def _argparse():
    """Parse the command line (``sys.argv``) and return the namespace.

    The namespace has ``filedir``, ``maxprefix`` (default ``"ENSG"``),
    ``outputfile`` (default ``"output.txt"``) and ``parallel`` (``None`` when
    the option is absent, ``10`` when given without a number).
    """
    import argparse

    def positive_int(s):
        """Convert *s* to an int, rejecting values below 1."""
        v = int(s)
        if v < 1:
            # "!r" is the repr conversion. The previous "{:r}" is an invalid
            # format spec for a string, so format() raised ValueError and
            # this message was never shown to the user.
            raise argparse.ArgumentTypeError('{!r} must be a positive integer'.format(s))
        return v

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""Filter text files and write min value.
Performs these operations on the text files in supplied `filedir`:
1. In each file, identify lines starting with the matching `maxprefix`
which do *not* contain the maximum value for that prefix in that file.
2. DESTRUCTIVELY REWRITE each file with lines found in step 1 removed!
3. Write the minimum value (for all lines in all files) to `outputfile`.
""")
    parser.add_argument('filedir',
        help="Directory containg the text files to process. WILL REWRITE FILES!")
    parser.add_argument('maxprefix', nargs="?", default="ENSG",
        help="Line prefix which should have values less than max value removed in each file")
    parser.add_argument('outputfile', nargs="?", default="output.txt",
        help="File in which to write min value found. WILL REWRITE FILES!")
    parser.add_argument('-p', '--parallel', metavar="N", nargs="?", type=positive_int, const=10,
        help="Process files in parallel, with N workers. Default is to process a file at a time.")
    return parser.parse_args()
# Script entry point: parse the command line, then process every *.txt file
# of the given directory either in parallel or one file at a time.
if __name__ == '__main__':
    cli = _argparse()
    pattern = os.path.join(cli.filedir, '*.txt')
    if not cli.parallel:
        main(pattern, cli.maxprefix, cli.outputfile)
    else:
        multi_main(pattern, cli.maxprefix, cli.outputfile, cli.parallel)
现在你可以从命令行调用它:
$ python ENSG.py txtdir ENSCAFG --parallel=4