
2024-06-16

[{'mississippi': 1, 'worth': 1, 'reading': 1}, {'commonplace': 1, 'river': 1, 'contrary': 1, 'ways': 1, 'remarkable': 1}, {'considering': 1, 'missouri': 1, 'main': 1, 'branch': 1, 'longest': 1, 'river': 1, 'world--four': 1}, {'seems': 1, 'safe': 1, 'crookedest': 1, 'river': 1, 'part': 1, 'journey': 1, 'uses': 1, 'cover': 1, 'ground': 1, 'crow': 1, 'fly': 1, 'six': 1, 'seventy-five': 1}, {'discharges': 1, 'water': 1, 'st': 1}, {'lawrence': 1, 'twenty-five': 1, 'rhine': 1, 'thirty-eight': 1, 'thames': 1}, {'river': 1, 'vast': 1, 'drainage-basin:': 1, 'draws': 1, 'water': 1, 'supply': 1, 'twenty-eight': 1, 'states': 1, 'territories': 1, 'delaware': 1, 'atlantic': 1, 'seaboard': 1, 'country': 1, 'idaho': 1, 'pacific': 1, 'slope--a': 1, 'spread': 1, 'forty-five': 1, 'degrees': 1, 'longitude': 1}, {'mississippi': 1, 'receives': 1, 'carries': 1, 'gulf': 1, 'water': 1, 'fifty-four': 1, 'subordinate': 1, 'rivers': 1, 'navigable': 1, 'steamboats': 1, 'hundreds': 1, 'flats': 1, 'keels': 1}, {'area': 1, 'drainage-basin': 1, 'combined': 1, 'areas': 1, 'england': 1, 'wales': 1, 'scotland': 1, 'ireland': 1, 'france': 1, 'spain': 1, 'portugal': 1, 'germany': 1, 'austria': 1, 'italy': 1, 'turkey': 1, 'almost': 1, 'wide': 1, 'region': 1, 'fertile': 1, 'mississippi': 1, 'valley': 1, 'proper': 1, 'exceptionally': 1}]




# Count how often each word of sample.txt occurs, skipping every word that is
# listed in common.txt, and print the resulting {word: count} mapping.

# Read the common words into a set: membership tests are O(1) and duplicate
# entries in common.txt collapse automatically.
with open('common.txt', 'r') as common_file:
    common_words = set(common_file.read().split())

# Read the sample text as ONE string so the separators can be replaced before
# the text is split into words.
with open('sample.txt', 'r') as sample_file:
    data = sample_file.read()

# Treat commas, semicolons and line breaks as plain word separators.
for char in ',;\n':
    data = data.replace(char, ' ')

# Single pass over the words; dict.get supplies 0 for a word's first sighting.
word_counts = {}
for word in data.split():
    if word not in common_words:
        word_counts[word] = word_counts.get(word, 0) + 1

print(word_counts)


  1. 将您的common.txt读入set,以便快速查找。
  2. 读取您的sample.txt并按.分割,得到单独的句子。
  3. 清除所有非单词字符(您必须自行定义它们,或使用regex的\b来捕获单词边界),并用空格替换它们。
  4. 按空格分割,并统计步骤1的set中不存在的单词。


import collections

# Build the set of words to ignore: every line of `common.txt`, trimmed and
# lowercased.
with open("common.txt", "r") as f:
    common_words = set()
    for raw_line in f:
        common_words.add(raw_line.strip().lower())

# Characters that separate words; the table maps each of them to a space.
interpunction = ";,'\""
trans_table = str.maketrans(interpunction, " " * len(interpunction))

sentences_counter = []  # one word -> count mapping per sentence, in file order
with open("sample.txt", "r") as f:
    # Read the file in one go (sentences may span line breaks) and cut it at
    # every `.`; fragments that are only whitespace are dropped.
    sentences = []
    for chunk in f.read().split("."):
        if chunk.strip():
            sentences.append(chunk)
    for sentence in sentences:
        sentence = sentence.translate(trans_table)  # separators become spaces
        word_counter = collections.defaultdict(int)  # missing words start at 0
        for word in map(str.lower, sentence.split()):
            if word in common_words:  # skip words listed in common.txt
                continue
            word_counter[word] += 1
        sentences_counter.append(word_counter)





Sentence #1:
    area: 1
    drainage-basin: 1
    great: 1
    combined: 1
    areas: 1
    england: 1
    wales: 1
    wide: 1
    region: 1
    fertile: 1
Sentence #2:
    mississippi: 1
    valley: 1
    proper: 1
    exceptionally: 1



# For every sentence print a header, then each counted word followed by a
# doubly-indented listing of all the OTHER words counted in the same sentence.
for sentence_no, counts in enumerate(sentences_counter, start=1):
    print("Sentence #{}:".format(sentence_no))
    for word, count in counts.items():
        print("\t{} {}".format(word, count))
        others = [
            "\t\t{}: {}".format(w, c)
            for w, c in counts.items()
            if w != word
        ]
        # An empty `others` still prints one blank line, as "\n".join([]) is "".
        print("\n".join(others))


更新2:如果您愿意,您不必为common_words使用set。在本例中,它几乎可以与list互换,因此您可以使用list comprehension而不是set comprehension(即用方括号替换花括号),但是在list中查找是一个O(n)操作,而set查找是O(1)操作,因此这里首选set。更不用说当common.txt中有重复单词时,set还有自动去重的附带好处。


# Collect the words to ignore: each line of `common.txt`, trimmed and lowercased.
with open("common.txt", "r") as f:
    common_words = set(line.strip().lower() for line in f)

# Characters that separate words; the table maps each of them to a space.
interpunction = ";,'\""
trans_table = str.maketrans(interpunction, " " * len(interpunction))

sentences_counter = []  # one plain dict of word counts per sentence
with open("sample.txt", "r") as f:
    # Whole-file read keeps line breaks inside sentences; `.` ends a sentence.
    # `filter(str.strip, ...)` discards fragments that are only whitespace.
    sentences = list(filter(str.strip, f.read().split(".")))
    for sentence in sentences:
        sentence = sentence.translate(trans_table)  # separators become spaces
        word_counter = {}
        for word in sentence.split():
            word = word.lower()
            if word in common_words:  # skip words listed in common.txt
                continue
            if word in word_counter:
                word_counter[word] += 1
            else:
                word_counter[word] = 1  # first occurrence in this sentence
        sentences_counter.append(word_counter)


import collections

# Collect the words to ignore: each line of `common.txt`, trimmed and lowercased.
with open("common.txt", "r") as f:
    common_words = set()
    for entry in f:
        common_words.add(entry.strip().lower())

# Here `.` is a separator too, because the text is not split into sentences.
interpunction = ";,'\"."
trans_table = str.maketrans(interpunction, " " * len(interpunction))

sentences_counter = []  # assigned but not used by the whole-file count below

word_counter = collections.defaultdict(int)  # missing words start at 0
with open("sample.txt", "r") as f:
    for line in f:  # stream the file one line at a time
        cleaned = line.translate(trans_table)  # separators become spaces
        for word in cleaned.split():
            key = word.lower()
            if key in common_words:  # skip words listed in common.txt
                continue
            word_counter[key] += 1

# Emit one "word: count" line per distinct word, in first-seen order.
report_lines = []
for w, c in word_counter.items():
    report_lines.append("{}: {}".format(w, c))
print("\n".join(report_lines))

