在 .txt 文件中查找最常用单词的 Python 程序：必须打印该单词及其出现次数

def countWords(lines):
    """Count how many times each whitespace-separated word occurs.

    Args:
        lines: An iterable of strings (for example, the lines of a file).

    Returns:
        A dict mapping each word to the number of times it appears
        across all of the given lines.
    """
    wordDict = {}
    for line in lines:
        # Split the current line, not the whole `lines` collection.
        for word in line.split():
            wordDict[word] = wordDict.get(word, 0) + 1
    return wordDict

from __future__ import division, print_function

inputFileName = 'gb.txt'

# Characters that are hard to see when printed, mapped to a readable label.
_DISPLAY_NAMES = {'\n': '\\n', ' ': 'space', '\t': '\\t'}


def _printCount(char, count):
    """Print one character and its count, labelling newline, space and tab."""
    print(_DISPLAY_NAMES.get(char, char), count)


def readfile(fname):
    """Return the entire contents of the file `fname`, lower-cased."""
    # `with` closes the file even if reading raises.
    with open(fname, 'r') as f:
        return f.read().lower()


def countChars(t):
    """Return a dict mapping each character of `t` to its number of occurrences."""
    charDict = {}
    for char in t:
        charDict[char] = charDict.get(char, 0) + 1
    return charDict


def findMostCommon(charDict):
    """Return the key of `charDict` with the highest count.

    Returns '' when the dict is empty or no count is greater than zero.
    When several keys share the highest count, the first one encountered
    during iteration is returned.
    """
    mostFreq = ''
    mostFreqCount = 0
    for char, count in charDict.items():
        if count > mostFreqCount:
            mostFreqCount = count
            mostFreq = char
    return mostFreq


def printCounts(charDict):
    """Print every character and its count, in the dict's iteration order."""
    for char in charDict:
        _printCount(char, charDict[char])


def printAlphabetically(charDict):
    """Print every character and its count, ordered by character."""
    for char in sorted(charDict):
        _printCount(char, charDict[char])


def printByFreq(charDict):
    """Print every character and its count, highest count first.

    Characters with equal counts are printed in descending character order.
    """
    pairs = sorted(((count, char) for char, count in charDict.items()),
                   reverse=True)
    for count, char in pairs:
        _printCount(char, count)


def main():
    """Read the input file and print its character counts, most frequent first."""
    charCounts = countChars(readfile(inputFileName))
    # Other reports available: printCounts(charCounts),
    # printAlphabetically(charCounts), or findMostCommon(charCounts)
    # for just the single most frequent character.
    printByFreq(charCounts)


# Only run the report when executed as a script, not when imported.
if __name__ == '__main__':
    main()

3条回答

网友

1楼 · 编辑于 2024-05-16 17:48:15

如果使用现成的强大工具，这个程序的核心实际上只需要 4 行：

import collections
import re

# Read the whole file into memory as a single string.
with open(yourfile) as f:
    text = f.read()

# A word is a run of word characters and apostrophes.
words = re.compile(r"[\w']+", re.U).findall(text)   # re.U == re.UNICODE
# Tally how many times each word occurs.
counts = collections.Counter(words)

正则表达式将找到所有单词，与它们相邻的标点符号无关（但将撇号作为单词的一部分计算）。

计数器的作用几乎与字典一样，但您可以执行诸如counts.most_common(10)、添加计数等操作。请参见help(Counter)

我还建议您不要使用函数printBy...，因为只有没有副作用的函数才易于重用。

def countsSortedAlphabetically(counter, **kw):
    """Return the (key, count) pairs of `counter` as a sorted list.

    Any keyword arguments (for example ``reverse=True``) are passed
    straight through to the sort.
    """
    pairs = list(counter.items())
    pairs.sort(**kw)
    return pairs

#def countsSortedNumerically(counter, **kw):
#    return sorted(counter.items(), key=lambda x:x[1], **kw)
#### use counter.most_common(n) instead

# `from pprint import pprint as pp` is also useful
def printByLine(tuples):
    """Print each tuple on its own line, with its items separated by spaces."""
    rows = []
    for entry in tuples:
        rows.append(' '.join(str(field) for field in entry))
    print('\n'.join(rows))

演示：

>>> words = Counter(['test','is','a','test'])
>>> printByLine( countsSortedAlphabetically(words, reverse=True) )
test 2
is 1
a 1

编辑（回应 Mateusz Konieczny 的评论）：已将 [a-zA-Z'] 替换为 [\w']。根据 Python 文档，字符类 \w“匹配 Unicode 单词字符；这包括任何语言中可以作为单词一部分的大多数字符，以及数字和下划线。如果使用 ASCII 标志，则只匹配 [a-zA-Z0-9_]。”（……但它显然不匹配撇号……）不过，\w 包含 _ 和 0-9，因此如果不需要它们并且不处理 Unicode，可以使用 [a-zA-Z']；如果要处理 Unicode，则需要使用否定断言之类的方法，从 \w 字符类中去掉 [0-9_]。

网友

2楼 · 编辑于 2024-05-16 17:48:15

如果你需要在一篇文章中计算一些单词，那么最好使用regex。

让我们从一个简单的例子开始：

import re

my_string = "Wow! Is this true? Really!?!? This is crazy!"

# Collect every run of word characters, i.e. each word in the text.
words = [match.group(0) for match in re.finditer(r'\w+', my_string)]

结果：

>>> words
['Wow', 'Is', 'this', 'true', 'Really', 'This', 'is', 'crazy']

注意“Is”和“is”被当作两个不同的词。我猜你希望把它们算作同一个词，所以我们可以先把所有单词转成大写，然后再计数。

from collections import Counter

# Upper-case every word so that differently-cased spellings count as one word.
cap_words = list(map(str.upper, words))

# Tally how many times each upper-cased word appears.
word_counts = Counter(cap_words)

结果：

>>> word_counts
Counter({'THIS': 2, 'IS': 2, 'CRAZY': 1, 'WOW': 1, 'TRUE': 1, 'REALLY': 1})

到这里都能理解吗？

现在我们需要做和上面一样的事情，这次我们读的是一个文件。

import re
from collections import Counter

# Read the whole file into memory as one string.
with open('your_file.txt') as f:
    passage = f.read()

# Extract the words, upper-case them so case is ignored, then tally them.
words = [match.group(0) for match in re.finditer(r'\w+', passage)]
cap_words = list(map(str.upper, words))
word_counts = Counter(cap_words)

网友

3楼 · 编辑于 2024-05-16 17:48:15

~~你有一个简单的拼写错误：在应该写 word 的地方写成了 words。~~

编辑：您似乎已经修改了源代码。请使用复制粘贴，这样第一次就能贴对。

编辑 2：显然你不是唯一一个容易出错的人。真正的问题是：在应该写 line 的地方写成了 lines。很抱歉，我错怪你修改了源代码。

相关问题更多 >

编程相关推荐

热门问题

热门文章