Python使用字典和元组查找单词和字母的唯一计数

# Count how often each word and each character occurs in a text file.
# The statement/indentation structure below was restored from a single
# collapsed line; the logic itself is unchanged.
import sys
import os
os.getcwd()
import string

# Directory containing the input file (placeholder; set before running).
path = ""
os.chdir(path)

# Prompt for user to input filename:
fname = input('Enter the filename: ')
try:
    fhand = open(fname)
except IOError:
    # Invalid filename error
    print('\n')
    print("Sorry, file can't be opened! Please check your spelling.")
    sys.exit()

# Initialize char counts and word counts dictionary
counts = {}
worddict = {}

# For character and word frequency count
for line in fhand:
    # Strip leading and trailing whitespace
    line = line.strip()
    # Convert everything in the string to lowercase
    line = line.lower()
    # Remove punctuation
    line = line.translate(line.maketrans('', '', string.punctuation))
    # NOTE: this removes every whitespace character, including the spaces
    # between words, so the split(" ") below can no longer separate words
    # and each line ends up as one long "word".
    line = line.translate(line.maketrans('', '', string.whitespace))
    # Remove digits
    line = line.translate(line.maketrans('', '', string.digits))
    # Splitting line into words
    words = line.split(" ")
    for word in words:
        # Is the word already in the word dictionary?
        if word in worddict:
            # Increase by 1
            worddict[word] += 1
        else:
            # Add word to dictionary with count of 1 if not there already
            worddict[word] = 1
    # Character count (here `word` is a single character of the line)
    for word in line:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1

# Initialize lists
lst = []
countlst = []
freqlst = []

# Collect (count, character) pairs and the bare counts
for ltrs, c in counts.items():
    lst.append((c, ltrs))
    countlst.append(c)

# Sum up the count
totalcount = sum(countlst)

# Calculate the percentage frequency of each character
for ec in countlst:
    efreq = (ec/totalcount) * 100
    freqlst.append(efreq)

# Sort lists by count and percentage frequency
freqlst.sort(reverse=True)
lst.sort(reverse=True)

# Print out word counts
for key in list(worddict.keys()):
    print(key, ":", worddict[key])

# Print out all letters and counts.
# Each tuple in lst is (count, character), so in this loop `ltrs` holds the
# count and `c` holds the character.
for ltrs, c, in lst:
    print(c, '-', ltrs, '-', round(ltrs/totalcount*100, 2), '%')

butsoftwhatlightthroughyonderwindowbreaks : 1 itistheeastandjulietisthesun : 1 arisefairsunandkilltheenviousmoon : 1 whoisalreadysickandpalewithgrief : 1 i - 14 - 10.45 % t - 12 - 8.96 % e - 12 - 8.96 % s - 11 - 8.21 % a - 11 - 8.21 % n - 9 - 6.72 % h - 9 - 6.72 % o - 8 - 5.97 % r - 7 - 5.22 % u - 6 - 4.48 % l - 6 - 4.48 % d - 6 - 4.48 % w - 5 - 3.73 % k - 3 - 2.24 % g - 3 - 2.24 % f - 3 - 2.24 % y - 2 - 1.49 % b - 2 - 1.49 % v - 1 - 0.75 % p - 1 - 0.75 % m - 1 - 0.75 % j - 1 - 0.75 % c - 1 - 0.75 %

2条回答

网友

1楼 · 编辑于 2024-05-14 18:52:46

line = line.translate(line.maketrans('', '', string.whitespace))

您用这行代码删除了行中的所有空白字符（包括单词之间的空格）。去掉这一行，程序就会按您的预期工作。

网友

2楼 · 编辑于 2024-05-14 18:52:46

您的代码先删除了所有空白字符，然后再按空格分割——这没有意义。要从给定文本中提取每个单词，我建议把全部文本整理成一行，相邻单词之间只保留一个空格——这意味着不仅要去掉换行符、多余的空格、特殊/不需要的字符和数字，还要去掉控制字符。

这应该可以做到：

"""Report word and character frequencies for a text file.

The script asks for a filename, normalises the file's text (lower-case,
digits and ASCII punctuation removed, whitespace collapsed), then prints the
ten most frequent words followed by every remaining character with its count
and percentage share.
"""
import os
import string
import sys
from collections import Counter

# Directory that the filename is resolved against (placeholder; set before
# running).
path = "/your/path"

# Built once: deletes ASCII digits and punctuation in a single translate pass.
_STRIP_TABLE = str.maketrans("", "", string.digits + string.punctuation)


def clean_text(raw):
    """Return *raw* as one lower-case line of space-separated words.

    Digits and ASCII punctuation are deleted; every run of whitespace
    (newlines, tabs, repeated spaces) becomes a single space, and leading
    and trailing whitespace is dropped.
    """
    return " ".join(raw.lower().translate(_STRIP_TABLE).split())


def count_words(text):
    """Return a Counter mapping each whitespace-separated word to its count.

    An empty or whitespace-only *text* yields an empty Counter rather than a
    single empty-string "word".
    """
    return Counter(text.split())


def letter_frequencies(text):
    """Return ``(count, character, percent)`` tuples for *text*.

    Whitespace is ignored. Tuples are ordered by count, highest first; ties
    are ordered by character, highest first. ``percent`` is the character's
    share of all counted characters, rounded to two decimals. Empty input
    gives an empty list.
    """
    counts = Counter("".join(text.split()))
    total = sum(counts.values())
    ranked = sorted(((n, ch) for ch, n in counts.items()), reverse=True)
    return [(n, ch, round(n / total * 100, 2)) for n, ch in ranked]


def main():
    """Prompt for a filename and print its word and character statistics."""
    os.chdir(path)

    # Prompt for user to input filename:
    fname = input("Enter the filename: ")

    try:
        fhand = open(fname)
    except OSError:
        # Invalid filename error
        print("\n")
        print("Sorry, file can't be opened! Please check your spelling.")
        sys.exit()

    # Close the file as soon as its contents have been read.
    with fhand:
        text = clean_text(fhand.read())

    # Ten most frequent words; ties keep first-seen order.
    for word, n in count_words(text).most_common(10):
        print(word, ":", n)

    # Every character with its count and percentage share.
    for n, ch, percent in letter_frequencies(text):
        print(ch, "-", n, "-", percent, "%")


if __name__ == "__main__":
    main()

相关问题更多 >

编程相关推荐

热门问题

热门文章