使用线程获取文件中每个单词的计数

# Count the words of a file with several threads and report the elapsed time.
# Written so that it runs unchanged on both Python 2 and Python 3.
import sys
import threading
import time

try:
    import thread  # Python 2 name of the low-level thread module
except ImportError:
    import _thread as thread  # Python 3 renamed the module to _thread


class CountWords(threading.Thread):
    """Worker thread that tallies how often each word occurs in its word list.

    The constructor receives ``lock`` and a ``(counts_dict, word_list)`` pair.
    ``run`` adds one to ``counts_dict[word]`` for every word in ``word_list``.
    """

    def __init__(self, lock, tuple):
        # NOTE: the parameter is still called ``tuple`` (shadowing the builtin)
        # only so that existing keyword callers keep working.
        threading.Thread.__init__(self)
        # The lock is stored but not used by run(): the script below hands
        # every worker its own dictionary, so there is nothing to guard.
        self.lock = lock
        self.list = tuple[1]
        self.dit = tuple[0]

    def run(self):
        """Update ``self.dit`` with the count of every word in ``self.list``."""
        counts = self.dit
        for word in self.list:
            # dict.get avoids the ``word in counts.keys()`` test, which scans
            # a list on Python 2 and made the loop quadratic.
            counts[word] = counts.get(word, 0) + 1


def getWordsFromFile(numThreads, fileName):
    """Read ``fileName`` and deal its words round-robin into per-thread lists.

    Args:
        numThreads: number of lists to create; anything ``int()`` accepts.
        fileName: path of the text file to read.

    Returns:
        A list of ``int(numThreads)`` lists of words. Lines are split on a
        single space, so consecutive spaces yield empty-string "words".

    Raises:
        IndexError: if ``numThreads`` is 0 and the file contains text.
    """
    lists = [[] for _ in range(int(numThreads))]
    print(len(lists))
    # ``with`` guarantees the file is closed, even when reading fails.
    with open(fileName, "r") as handle:
        # read().splitlines() instead of readlines() drops the "\n"s.
        lines = handle.read().splitlines()
    all_words = make1d([line.split(" ") for line in lines])
    cur = 0
    for word in all_words:
        lists[cur].append(word)
        cur = (cur + 1) % len(lists)  # wrap around after the last list
    return lists


def make1d(list):
    """Flatten one level of nesting: an iterable of iterables -> a flat list."""
    newList = []
    for x in list:
        newList.extend(x)
    return newList


def printDict(dit):
    """Print every ``key : value`` pair of ``dit``, ordered by key."""
    for key in sorted(dit):
        print("%s : %s" % (key, dit[key]))


def main(argv):
    """Run the benchmark; ``argv`` is ``[program, numThreads, fileName]``."""
    if len(argv) < 3:
        sys.exit("usage: %s <numThreads> <fileName>" % argv[0])
    print("Starting now")
    start = int(round(time.time() * 1000))
    lock = threading.Lock()
    numThreads = argv[1]
    fileName = "" + argv[2]
    # One private dictionary per worker.
    ditList = [{} for _ in range(int(numThreads))]
    wordLists = getWordsFromFile(numThreads, fileName)
    print("got words from file")
    threadList = [CountWords(lock, pair) for pair in zip(ditList, wordLists)]
    for t in threadList:
        t.start()
    for t in threadList:
        # join() returns immediately for a finished thread, so no liveness
        # check is needed (Thread.isAlive() was removed in Python 3.9).
        t.join()
    fin = int(round(time.time() * 1000)) - start
    print("with %s threads counting the words took : %s ms" % (numThreads, fin))


if __name__ == "__main__":
    main(sys.argv)

2条回答

网友

1楼 · 编辑于 2024-06-08 23:14:20

您可以使用 itertools 来统计文件中的单词。下面是一个简单的示例代码。请了解 itertools.groupby 的分组功能，并根据您的逻辑修改代码。

import itertools

# Sample input: every string stands in for one line of text.
tweets = ["I am a cat", "cat", "Who is a good cat"]

# groupby() only merges *adjacent* equal items, so sort the words first.
words = sorted(itertools.chain.from_iterable(map(str.split, tweets)))
# Map each distinct word to the size of its run in the sorted list.
count = dict(
    (word, len(list(group))) for word, group in itertools.groupby(words)
)

网友

2楼 · 编辑于 2024-06-08 23:14:20

由于 GIL（全局解释器锁，参见“What is a global interpreter lock (GIL)?”），Python 的线程不能并行运行（即无法利用多个内核）。

对于此任务，额外的线程只会增加代码的开销，使其变得更慢。

我可以说有两种情况可以使用线程：

当您有大量的 I/O 时：线程可以使您的代码并发运行（而不是并行运行，参见 https://blog.golang.org/concurrency-is-not-parallelism），因此您的代码可以在等待响应的同时处理其他事情，从而获得明显的加速。
当您不希望大量计算阻塞您的代码时：您可以使用线程让这些计算与其他任务同时进行。

如果您想利用所有的核心，您需要使用多进程模块 multiprocessing（https://docs.python.org/3.6/library/multiprocessing.html）。

相关问题更多 >

编程相关推荐

热门问题

热门文章