擅长:python、mysql、java
<p>这个想法是把大文件分成更小的文件。调用许多将执行计数作业并返回计数器的工作线程。
最后合并计数器。</p>
<pre><code>from itertools import islice
from multiprocessing import Pool
from collections import Counter
import os
NUM_OF_LINES = 3  # max lines per chunk file produced by slice_huge_file
INPUT_FILE = 'huge.txt'  # the large input file to split into chunks
POOL_SIZE = 10  # number of worker processes counting chunks in parallel
def slice_huge_file(input_file='huge.txt', num_of_lines=3):
    """Split *input_file* into numbered chunk files of at most *num_of_lines* lines.

    Chunks are written to ``sub_huge_1.txt``, ``sub_huge_2.txt``, ... in the
    current directory, preserving line content exactly.

    Args:
        input_file: path of the large text file to split
            (defaults to the module's original hard-coded name).
        num_of_lines: maximum number of lines per chunk file.
    """
    cnt = 0
    with open(input_file) as f:
        while True:
            # islice pulls the next batch lazily, so the whole file is
            # never held in memory at once.
            chunk = list(islice(f, num_of_lines))
            if not chunk:
                break
            # Increment only when a chunk is actually written; the original
            # bumped cnt before the emptiness check, leaving it one too
            # high after the loop.
            cnt += 1
            with open('sub_huge_{}.txt'.format(cnt), 'w') as out:
                out.writelines(chunk)
def count_file_words(input_file):
    """Count word frequencies in *input_file*.

    The original implementation counted whole (stripped) lines — and counted
    every blank line as the empty string ``''`` — despite being named a word
    counter. This version splits each line on whitespace so the behavior
    matches the name.

    Args:
        input_file: path to a text chunk file.

    Returns:
        collections.Counter mapping each whitespace-separated word to its
        number of occurrences.
    """
    with open(input_file, 'r') as f:
        # Stream line by line instead of readlines() so memory stays flat
        # regardless of chunk size; str.split() with no args drops blanks.
        return Counter(word for line in f for word in line.split())
if __name__ == '__main__':
    # Split the big file into sub_huge_N.txt chunks first.
    slice_huge_file()
    sub_files = [os.path.join('.', f)
                 for f in os.listdir('.') if f.startswith('sub_huge')]
    # The context manager terminates and joins the pool on exit; the
    # original never called close()/join(), leaking worker processes.
    with Pool(POOL_SIZE) as pool:
        results = pool.map(count_file_words, sub_files)
    # Merge the per-chunk counters into a single final tally.
    final_counter = Counter()
    for counter in results:
        final_counter += counter
    print(final_counter)
</code></pre>