词频计数奇怪结果

def toptenwords(mycorpus):
    """Print the ten most frequent non-stop-words of ``mycorpus``.

    ``mycorpus`` must provide a ``words()`` method returning an iterable of
    string tokens. Tokens are lower-cased, stop words are dropped, punctuation
    characters are removed, and the remaining words are counted.

    Reads the module-level ``stoplist`` (a container of lower-case stop
    words), which has to be defined before this function is called.
    """
    # Local imports keep the function self-contained.
    import collections
    import string

    punctuation = frozenset(string.punctuation)
    woordcounter = collections.Counter()
    for word in mycorpus.words():
        lowered = word.lower()
        if lowered in stoplist:
            continue
        # Portable replacement for the Python-2-only
        # ``str.translate(None, string.punctuation)``.
        cleaned = "".join(ch for ch in lowered if ch not in punctuation)
        # Tokens made only of punctuation collapse to '' and must not be
        # counted as a word; also re-check the stop list so that tokens such
        # as "the." are filtered once the punctuation is gone.
        if cleaned and cleaned not in stoplist:
            woordcounter[cleaned] += 1

    # most_common(10) yields at most ten (word, count) pairs, starting with
    # the most frequent one, so short inputs no longer raise IndexError and
    # the top word is no longer skipped.
    top10 = [word for word, _count in woordcounter.most_common(10)]

    if len(top10) > 1:
        listing = ", ".join(top10[:-1]) + " en " + top10[-1]
    else:
        listing = "".join(top10)
    print("De 10 meest frequente woorden zijn: " + listing)

1条回答

网友

1楼 · 发布于 2024-04-20 13:05:40

使用collections.Counter。它非常适合统计（可哈希）项的出现频率，并且它有一个most_common方法，可以直接返回出现次数最多的前十项，而无需您自己编写这部分逻辑：

import string
import collections


def topNwords(mywords, N=10, stoplist=frozenset(), filtered=frozenset()):
    """Return the ``N`` most frequent words in ``mywords``.

    Each token is lower-cased, dropped if it is a stop word, and stripped of
    punctuation characters before counting.

    Args:
        mywords: Iterable of string tokens (for example the result of
            ``str.split()``).
        N: Maximum number of words to return.
        stoplist: Container of lower-case words to ignore.
        filtered: Ignored; kept only so existing callers that pass it keep
            working.

    Returns:
        A list of at most ``N`` words ordered from most to least frequent.
    """
    punctuation = frozenset(string.punctuation)
    woordcounter = collections.Counter()
    for word in mywords:
        lowered = word.lower()
        if lowered in stoplist:
            continue
        # ``str.translate(None, chars)`` only exists on Python 2 byte
        # strings; filtering character by character works on every version.
        cleaned = "".join(ch for ch in lowered if ch not in punctuation)
        # Skip tokens that were pure punctuation (they collapse to '') and
        # stop words that only match once punctuation is removed ("the.").
        if cleaned and cleaned not in stoplist:
            woordcounter[cleaned] += 1
    return [word for word, _freq in woordcounter.most_common(N)]


# Demo: split a sample sentence on whitespace and ask for its ten most
# frequent words.
top_ten = topNwords(
    mywords='This is a test. It is only a test. In case of a real emergency'.split(),
    N=10,
)
# Show the words as a single comma-separated line.
print("De 10 meest frequente woorden zijn: {w}".format(w=', '.join(top_ten)))

相关问题更多 >

编程相关推荐

热门问题

热门文章