我想从杂乱的文本文件中提取一些特定的单词,这些单词有时拼写有误,或者夹杂着不该出现的字符。目前我已经能够在一个目录下的多个文件中统计拼写完全匹配的单个单词,这已经很接近,但还不完全是我想要的。最后一点是,我希望把这份包含单词和短语计数的列表保存到一个文本文件中,而不只是像我现在的代码那样把它作为摘要打印出来。
如果无法找到相近的匹配项也没关系,但如果能做到那就最理想了。
谢谢你的帮助
import os
from collections import Counter
import glob
def word_frequency(fileobj, words):
    """Build a Counter of the specified words and phrases found in fileobj.

    Args:
        fileobj: Any iterable of text lines (e.g. an open text file).
        words: Iterable of target strings. A target may be a single word or
            a phrase of several whitespace-separated words.

    Returns:
        A Counter keyed by every entry of ``words``. Targets that never
        occur are present with a count of 0.

    Matching is exact and case-sensitive on whitespace-delimited tokens.
    A phrase matches a run of consecutive tokens on one line; phrases that
    straddle a line break are not detected. A word that is also part of a
    matching phrase is counted for both targets.
    """
    targets = list(words)
    # Initialise the counter to 0 for each target so absent ones are reported.
    ct = Counter({w: 0 for w in targets})

    # Map each target's token sequence back to the target as the caller
    # spelled it; targets without any token (e.g. "") can never match.
    lookup = {}
    for target in targets:
        key = tuple(target.split())
        if key:
            lookup.setdefault(key, target)
    sizes = sorted({len(key) for key in lookup})

    for line in fileobj:
        tokens = line.split()
        # Slide a window of each needed length over the line's tokens.
        for size in sizes:
            for start in range(len(tokens) - size + 1):
                key = tuple(tokens[start:start + size])
                if key in lookup:
                    ct[lookup[key]] += 1
    return ct
def count_words_in_dir(dirpath, words, action=None):
    """For each .txt file directly inside a directory, count the given words.

    Args:
        dirpath: Directory to scan; only files matching ``*.txt`` at its top
            level are read.
        words: Target words passed through to ``word_frequency``.
        action: Optional callable invoked as ``action(filepath, counter)``
            once per file. When omitted or falsy, counts are computed and
            discarded.

    Returns:
        None.
    """
    # The directory comes from the ``dirpath`` parameter.
    for filepath in glob.iglob(os.path.join(dirpath, '*.txt')):
        # errors="replace" keeps stray undecodable bytes in messy files
        # from aborting the whole run.
        with open(filepath, errors='replace') as f:
            ct = word_frequency(f, words)
        if action:
            action(filepath, ct)
def print_summary(filename, ct, file=None):
    """Print a three-line summary of word counts for one file.

    The output is the file name, the comma-separated words in sorted order,
    and the matching comma-separated counts, followed by blank lines.

    Args:
        filename: Name or path shown on the first line of the summary.
        ct: Mapping of word -> count (e.g. a Counter).
        file: Optional writable text stream receiving the summary. Defaults
            to standard output; pass an open file to save the report.
    """
    words = sorted(ct.keys())
    counts = [str(ct[k]) for k in words]
    # The heading is taken from the ``filename`` parameter.
    print('{0}\n{1}\n{2}\n\n'.format(
        filename,
        ', '.join(words),
        ', '.join(counts)), file=file)
# Words and phrases to tally in every .txt file of the current directory.
words = {'JUSTICE', "policy payment", "payment", "annuity", "CYNTHEA"}
count_words_in_dir('./', words, action=print_summary)
相关问题 更多 >
编程相关推荐