Python thread pool problem (waiting on something)
I wrote a simple website crawler using a thread pool. The problem: once the crawler has finished crawling the whole site it should terminate, but in practice it sits waiting on something at the end and the script never finishes. What is causing this?
from Queue import Queue
from threading import Thread
import sys
from urllib import urlopen
from BeautifulSoup import BeautifulSoup, SoupStrainer
import re

visited = set()
queue = Queue()

class Worker(Thread):
    """Thread executing tasks from a given tasks queue"""
    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def run(self):
        while True:
            func, args, kargs = self.tasks.get()
            print "startcall in thread", self
            print args
            try:
                func(*args, **kargs)
            except Exception, e:
                print e
            print "stopcall in thread", self
            self.tasks.task_done()

class ThreadPool:
    """Pool of threads consuming tasks from a queue"""
    def __init__(self, num_threads):
        self.tasks = Queue(num_threads)
        for _ in range(num_threads):
            Worker(self.tasks)

    def add_task(self, func, *args, **kargs):
        """Add a task to the queue"""
        self.tasks.put((func, args, kargs))

    def wait_completion(self):
        """Wait for completion of all the tasks in the queue"""
        self.tasks.join()

def process(pool, host, url):
    try:
        print "get url", url
        #content = urlopen(url).read().decode(charset)
        content = urlopen(url).read()
    except UnicodeDecodeError:
        return
    for link in BeautifulSoup(content, parseOnlyThese=SoupStrainer('a')):
        #print "link", link
        try:
            href = link['href']
        except KeyError:
            continue
        if not href.startswith('http://'):
            href = 'http://%s%s' % (host, href)
        if not href.startswith('http://%s%s' % (host, '/')):
            continue
        if href not in visited:
            visited.add(href)
            pool.add_task(process, pool, host, href)
            print href

def start(host, charset):
    pool = ThreadPool(7)
    pool.add_task(process, pool, host, 'http://%s/' % (host))
    pool.wait_completion()

start('simplesite.com', 'utf8')
1 Answer
The problem I see is that the while loop inside run never exits, so the worker threads just sit there forever. You need to break out of that loop once the work is done.
You can try two things:

1) In run, right after self.tasks.get(...), insert:

if not func: break

2) At the end of process, add:

pool.add_task(None, None, None)

This way process notifies the pool that it has no more tasks to hand out; a sketch of how the two changes fit together follows below.
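Here is a minimal, self-contained sketch of that sentinel ("poison pill") idea. Two details the short description above leaves implicit: task_done() must still be called for the sentinel, otherwise a later tasks.join() would keep blocking, and each worker thread needs to see its own sentinel, so this variant enqueues one per worker after the real tasks have drained rather than from inside process. The unbounded Queue() is a deliberate choice so that a task which adds new tasks can never block on put().

from Queue import Queue
from threading import Thread

class Worker(Thread):
    """Worker that exits its loop when it receives a sentinel task."""
    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def run(self):
        while True:
            func, args, kargs = self.tasks.get()
            if not func:
                # Sentinel received: balance the queue's bookkeeping, then exit.
                self.tasks.task_done()
                break
            try:
                func(*args, **kargs)
            except Exception, e:
                print e
            self.tasks.task_done()

class ThreadPool:
    """Pool of threads consuming tasks from a queue, with clean shutdown."""
    def __init__(self, num_threads):
        self.num_threads = num_threads
        self.tasks = Queue()  # unbounded, so add_task never blocks a worker
        for _ in range(num_threads):
            Worker(self.tasks)

    def add_task(self, func, *args, **kargs):
        self.tasks.put((func, args, kargs))

    def wait_completion(self):
        self.tasks.join()  # returns once every real task has been processed
        for _ in range(self.num_threads):
            self.tasks.put((None, None, None))  # one poison pill per worker

With this, start() returns as soon as the crawl queue drains: wait_completion() unblocks, each worker pops one sentinel and leaves its while loop, and the script exits.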