我有一个网址列表,希望用多进程抓取这些网址的网页内容,并在所有网址都抓取完成后让程序退出。
这是我的实现,不确定是否正确:
#coding=utf8
import multiprocessing
from multiprocessing import JoinableQueue
import urllib2
import logging
import os
logging.basicConfig(level=logging.DEBUG)
# URLs to fetch; each worker process pulls one at a time from the shared queue.
URLS = [
'http://stackoverflow.com/q/2243542/94962',
'http://docs.python.org/library/logging.html',
'http://www.python.org/dev/peps/pep-3101/',
'http://news.ycombinator.com/',
'http://www.evernote.com/about/learn_more/',
'http://news.php.net/php.internals/55293',
]
# One worker process per CPU core.
POOL_SIZE = multiprocessing.cpu_count()
# Directory where fetched pages are written.
# NOTE(review): nothing in this file creates it — it must exist beforehand,
# or every write in the workers will fail.
DEST_DIR = '/tmp/pytest/'
# JoinableQueue so the main process can block in join() until every
# enqueued URL has been acknowledged via task_done().
url_q = JoinableQueue()
class Worker(multiprocessing.Process):
    """Worker process: consume URLs from the shared ``url_q``, download
    each page with urllib2, and save it under ``DEST_DIR``.

    ``run`` loops forever; the main process calls ``terminate()`` on the
    workers after ``url_q.join()`` returns.
    """

    @staticmethod
    def _url_to_filename(url):
        """Map *url* to a flat local filename ending in '.html'.

        The 'http://' prefix is dropped (assumes an http URL — TODO
        confirm no https entries), '/' becomes '-', and an existing
        '.html' suffix is not duplicated.
        """
        # BUG FIX: the original used .strip('.html'), which strips the
        # *character set* {'.', 'h', 't', 'm', 'l'} from both ends of the
        # name (e.g. a URL ending in '...th' loses that tail), not the
        # '.html' suffix.  Remove the suffix explicitly instead.
        name = url[7:].replace('/', '-')
        if name.endswith('.html'):
            name = name[:-len('.html')]
        return name + '.html'

    def run(self):
        while True:
            url = url_q.get()
            try:
                logging.info('%(process_name)s processing %(url)s' % {
                    'process_name': multiprocessing.current_process().name,
                    'url': url,
                })
                web_cnt = urllib2.urlopen(url).read()
                path = os.path.join(DEST_DIR, self._url_to_filename(url))
                with open(path, 'w') as f:
                    f.write(web_cnt)
            except Exception:
                # One bad URL must not kill the worker; log and continue.
                logging.exception('error')
            finally:
                # BUG FIX: task_done() must run even when the fetch or the
                # file write fails — otherwise url_q.join() in the main
                # process blocks forever on the first error.
                url_q.task_done()
workers = []
for i in range(POOL_SIZE):
worker = Worker()
worker.name = 'worker%s'%i
workers.append(worker)
worker.start()
for url in URLS:
url_q.put(url)
url_q.join()
print 'workers have done stuff'
for worker in workers:
worker.terminate()
目前没有回答
相关问题 更多 >
编程相关推荐