Python查找可能拼写错误的单词频率和字符串频率,并保存为txt文件或CSV

2024-04-25 04:13:11 发布

您现在位置:Python中文网/ 问答频道 /正文

我要做的是从杂乱的文本文件中提取一些特定的单词,这些单词有时拼写错误,或者夹杂了不该出现的字符。我已经能够对一个目录中的多个文件按精确拼写统计单个单词,这已经很接近,但还不完全是我想要的。最后,我想把这个包含单词和短语计数的列表保存到一个文本文件中,而不只是像我现在的代码那样把它作为摘要打印出来。

如果无法查找近似匹配也没关系,但如果能做到就最理想了。

谢谢你的帮助

import os
from collections import Counter
import glob

def word_frequency(fileobj, words):
    """Build a Counter of the specified words found in fileobj.

    Args:
        fileobj: Iterable of text lines (e.g. an open text file).
        words: Collection of words to count. Membership is tested with
            ``in``, so a set gives the fastest lookups.

    Returns:
        A Counter with one entry per word in ``words``; words that never
        occur in ``fileobj`` are present with a count of 0.
    """
    # Seed every requested word with 0 so that absent words still show up
    # in the result instead of being silently dropped.
    ct = Counter({w: 0 for w in words})
    # Tokens are whitespace-delimited, so an entry of ``words`` that itself
    # contains whitespace can never equal a single token.
    file_words = (word for line in fileobj for word in line.split())
    ct.update(word for word in file_words if word in words)
    return ct


def count_words_in_dir(dirpath, words, action=None):
    """For each .txt file directly inside dirpath, count the specified words.

    Args:
        dirpath: Directory to scan (not recursive).
        words: Words to count; passed through to ``word_frequency``.
        action: Optional callable invoked as ``action(filepath, counter)``
            once per file, where ``counter`` is what ``word_frequency``
            returned for that file.
    """
    # Build the glob pattern from the ``dirpath`` argument; the previous
    # code referenced an undefined name here and raised NameError.
    for filepath in glob.iglob(os.path.join(dirpath, '*.txt')):
        with open(filepath) as f:
            ct = word_frequency(f, words)
            if action:
                action(filepath, ct)


def print_summary(filename, ct):
    """Print a file's name followed by its counted words and their counts.

    The output is three lines -- the file name, the words in sorted order
    joined by ", ", and the matching counts joined by ", " -- followed by
    blank lines.

    Args:
        filename: Name or path of the file the counts belong to.
        ct: Mapping of word -> count (e.g. a Counter).
    """
    words = sorted(ct)
    counts = [str(ct[k]) for k in words]
    # Use the ``filename`` parameter; the previous code referenced an
    # undefined name here and raised NameError.
    print('{0}\n{1}\n{2}\n\n'.format(
        filename,
        ', '.join(words),
        ', '.join(counts)))


# Terms to look for in every .txt file of the current directory.
# NOTE(review): word_frequency compares whitespace-delimited tokens, so the
# two-word entry "policy payment" presumably never matches -- confirm.
words = {'JUSTICE', "policy payment", "payment", "annuity", "CYNTHEA"}
count_words_in_dir('./', words, action=print_summary)

Tags: in import for def count dir counter action
1条回答
网友
1楼 · 发布于 2024-04-25 04:13:11
import sys
import os
from collections import Counter
import glob
# def count_words_in_dir(dirpath, words, action=None):
#     """For each .txt file in a dir, count the specified words"""
#     for filepath in glob.iglob(os.path.join(path, '*.txt')):
#         with open(filepath) as f:
#             data = f.read()
#             for key,val in words.items():
#                 print("key is " + key + "\n")
#                 ct = data.count(key)
#                 words[key] = ct
#             if action:
#                 action(filepath, ct)
# Keep a handle on the current stdout so it can be put back at the end of
# the script.
stdoutOrigin=sys.stdout 
# From here on, everything print() emits goes to log.txt (opened in "w"
# mode, so an existing file is overwritten).
sys.stdout = open("log.txt", "w")
              
def count_words_in_dir(dirpath, words, action=None):
    """Count occurrences of each key of ``words`` in every .txt file of dirpath.

    Args:
        dirpath: Directory to scan (not recursive).
        words: Dict whose keys are the strings to look for. Its values are
            overwritten in place with the counts for the file currently
            being processed.
        action: Optional callable invoked as ``action(filepath, words)``
            after each file has been counted.
    """
    # Build the pattern from the ``dirpath`` argument; the previous code
    # joined the literal string "path", so the argument was ignored.
    for filepath in glob.iglob(os.path.join(dirpath, '*.txt')):
        with open(filepath) as f:
            data = f.read()
        # str.count counts non-overlapping substring matches, so multi-word
        # phrases work, but a key also matches inside longer words.
        for key in words:
            words[key] = data.count(key)
        if action:
            action(filepath, words)


def print_summary(filepath, words):
    """Print the file path, then one "key:<TAB>value" line per entry.

    Entries of ``words`` are emitted in sorted order.
    """
    print(filepath)
    ordered_entries = sorted(words.items())
    for entry in ordered_entries:
        line = '{0}:\t{1}'.format(*entry)
        print(line)




try:
    # Directory to scan, taken from the first command-line argument.
    filepath = sys.argv[1]
    # Strings to count. These look like placeholders to be replaced with
    # the real terms; dict.fromkeys collapses duplicate keys into one.
    keys = ["keyword",
    "keyword"]
    words = dict.fromkeys(keys,0)

    count_words_in_dir(filepath, words, action=print_summary)
finally:
    # Always close the redirected stream and put the saved stdout back,
    # even when the argument is missing or counting raises; otherwise the
    # log file is never closed explicitly and stdout stays redirected.
    sys.stdout.close()
    sys.stdout=stdoutOrigin

相关问题 更多 >