Term frequency and document frequency of words

Posted 2024-04-20 10:36:46


I have written some code to find the term frequency and document frequency of the words contained in the files stored at the location path. Each file's words are extracted from the text file via the function cleanDoc(), and I want to tabulate the term frequencies so that words from all documents are taken into account when counting. Can someone tell me how to implement this? I am only using NLTK.

import collections
import os.path
import glob
import nltk

wdict = set()

path = "C://Python27//Corpus Files//*.*"

#this function cleans up a doc (removes stopwords etc)
def cleanDoc(doc):
    stopset = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.PorterStemmer()
    tokens = nltk.WordPunctTokenizer().tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 3 and token.isalpha()]
    final = [stemmer.stem(word) for word in clean]
    return final

for text in glob.glob(path):

    with open(text) as f:
        data = f.read()
    words = cleanDoc(data)
    wdict.update(words)

1 Answer

You can use a FreqDist object, from nltk.probability, to count these words. Later, you can navigate it with the key/value interface and methods of the dict class (such as freq.items() or freq.keys()), and even plot the results with matplotlib.
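Before the full solution, here is a minimal illustration of how FreqDist behaves on its own (it assumes NLTK is installed; the word list is arbitrary sample data):

```python
from nltk.probability import FreqDist

# Toy example: FreqDist counts occurrences of each item in an iterable.
freq = FreqDist(['cat', 'dog', 'cat', 'bird', 'cat', 'dog'])
print(freq['cat'])          # 3
print(freq.most_common(2))  # [('cat', 3), ('dog', 2)]
print(freq.N())             # total number of samples: 6
```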

import glob
import nltk
from nltk.probability import FreqDist


term_frequency = {}

path = "C://Python27//Corpus Files//*.*"

#this function cleans up a doc (removes stopwords etc)
def cleanDoc(doc):
    stopset = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.PorterStemmer()
    tokens = nltk.WordPunctTokenizer().tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 3 and token.isalpha()]
    final = [stemmer.stem(word) for word in clean]
    return final

for text in glob.glob(path):
    with open(text) as f:
        data = f.read()
    words = cleanDoc(data)
    number_of_words = len(words)
    freq = FreqDist(words)
    # term_frequency is a dict whose structure is like:
    # {
    #     'path_to_file':
    #         {'term': 0.134, 'another_term': 0.15},
    #     'another_file':
    #         {'term2': 0.12, 'foo': 0.15}
    # }
    term_frequency[text] = {}
    for term in freq.keys():
        term_frequency[text][term] = float(freq[term]) / number_of_words
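The code above covers term frequency only; the question also asks for document frequency, i.e. the number of files in which each term occurs. A minimal sketch, assuming term_frequency has been filled as above (the two sample files here are hypothetical stand-ins for that dict):

```python
import collections

# Stand-in for the term_frequency dict built in the loop above.
term_frequency = {
    'doc1.txt': {'cat': 0.5, 'dog': 0.5},
    'doc2.txt': {'cat': 1.0},
}

# document_frequency[term] = number of files containing the term at least once.
document_frequency = collections.Counter()
for filename, terms in term_frequency.items():
    document_frequency.update(terms.keys())

print(document_frequency['cat'])  # 2
print(document_frequency['dog'])  # 1
```

Because each file's inner dict already holds each term only once, updating the Counter with its keys counts every term at most once per document.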

Reference: https://nltk.googlecode.com/svn/trunk/doc/api/nltk.probability.FreqDist-class.html
