gevent pool is getting stuck
I'm new to gevent, but I think I more or less understand it. In short: when I set the pool size to 1, the code runs fine; but with a larger pool the code hangs, usually within the first batch (for example, with a pool size of 5 I see 3 greenlets finish, and then nothing more). Why is that? Is it a problem with spawn? Or with join?
I'm not sure whether the remote server gets confused by multiple requests, but it handles rapid sequential single requests without trouble, so that's probably not it...
(I'm sharing the full code because I'm not sure where the problem is. Thanks for your patience.)
from urllib2 import urlopen
from lxml.etree import parse
import os, sys, csv, cStringIO, codecs, pickle  # sys is needed for sys.exc_info() in the except blocks below
from selenium import webdriver
from time import sleep
import gevent
from gevent import socket
from gevent import monkey, pool
# patches stdlib (including socket and ssl modules) to cooperate with other greenlets
monkey.patch_all()
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([unicode(s).encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
os.chdir('/Users/laszlosandor/Downloads/kozbeszerzes')
HOSTNAME = 'http://kozbeszerzes.ceu.hu'
driver = webdriver.Chrome()
results = set()
for y in xrange(1998,2015):
for p in xrange(0,9999):
driver.get('http://kozbeszerzes.ceu.hu/searchresults.xhtml?q={}&page={}'.format(y,p))
sleep(1)
if len(driver.find_elements_by_class_name('result'))==0:
break
for e in driver.find_elements_by_class_name('result'):
link = e.find_element_by_tag_name('a')
r = link.get_attribute('href').encode('ascii', 'ignore')
if r[:34]== 'http://kozbeszerzes.ceu.hu/tender/':
results.add(r)
driver.quit()
with open('list_of_urls', 'wb') as f:
pickle.dump(results, f)
#with open('list_of_urls', 'r') as f:
# results = pickle.load(f)
entities = set()
header = ('TenderID','RequestorName','URL','Year','RequestorID','Subject','SourceURL','EstValue','Currency','DecisionDate','Value','VAT')
# """Spawn multiple workers and wait for them to complete"""
# # limit ourselves to max 10 simultaneous outstanding requests
p = pool.Pool(10)
f = open('tenders.csv', 'w')
f.write(codecs.BOM_UTF8)
writer = UnicodeWriter(f)
writer.writerow(header)
def workres(res):
try:
tender = parse(urlopen(res)).getroot()
print ('%s succeeded' % res)
for requestor in tender.findall('requestor'):
entities.add(HOSTNAME + requestor.get('url'))
id = tender.get('id')
reqname = tender.get('requestor')
url = tender.get('url')
year = tender.get('year')
reqid = tender.get('requestor_id')
subject = tender.get('subject')
source = tender.get('source_url')
estval = tender.get('estimated_value')
for part in tender.findall('./parts/part'):
winner = part.find('winner')
entities.add(HOSTNAME + winner.get('url'))
curr = part.find('currency').text
date = part.find('decisionDate').text
value = part.find('value').text
vat = part.find('vat').text
row = id, reqname, url, year, reqid, subject, source, estval, curr, date, value, vat
writer.writerow(row)
except socket.gaierror:
ex = sys.exc_info()[1]
print ('%s failed with %s' % (res, ex))
jobs = [p.spawn(workres, res) for res in results]
p.join()
f.close()
with open('entities', 'wb') as f:
pickle.dump(entities, f)
header = ['ID','URL','Name','NominalCity','City', 'ZIP', 'Address']
f = open('entities.csv', 'w')
f.write(codecs.BOM_UTF8)
writer = UnicodeWriter(f)
writer.writerow(header)
def workent(ent):
try:
ent = parse(urlopen(ent)).getroot()
print ('%s succeeded' % ent)
id = ent.get('id')
url = ent.get('url')
name = ent.get('name')
nominalcity = ent.get('city')
cities = ent.findall('./resolved_addresses/whitelistAddress/city')
zips = ent.findall('./resolved_addresses/whitelistAddress/postalCode')
streets = ent.findall('./resolved_addresses/whitelistAddress/street')
for a in xrange(0,len(cities)):
city = cities[a].text
zip = zips[a].text
street = streets[a].text
row = id, url, name, nominalcity, city, zip, street
writer.writerow(row)
except socket.gaierror:
ex = sys.exc_info()[1]
print ('%s failed with %s' % (ent, ex))
jobs = [p.spawn(workent, ent) for ent in entities]
p.join()
f.close()
1 Answer
I can see many mistakes here.
- You use time.sleep instead of gevent.sleep(), which blocks.
- Your variable names are very short. You could add a description of what each part of the code is supposed to do; for instance, the variable 'p' is used twice...
- You fetch URLs with both urlopen and the webdriver module, which is confusing...
- I would suggest using a queue between the workers and having a single dedicated worker do the writing to the file; right now you have several greenlets writing to the same file at once (see the sketch after this list)...
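A minimal sketch of that single-writer pattern, assuming gevent.queue.Queue; the names fetch_one, write_rows, rows and the SENTINEL marker are illustrative, while results, workres-style fetching and the UnicodeWriter instance writer are taken from the question:

import gevent
from gevent import pool
from gevent.queue import Queue

rows = Queue()      # workers put finished rows here; only the writer greenlet touches the file
SENTINEL = None     # illustrative marker telling the writer to stop

def fetch_one(url):
    # ... download and parse `url` here, as workres does ...
    row = (url, 'parsed', 'fields')   # placeholder for the real row
    rows.put(row)                     # hand the row off instead of writing it directly

def write_rows(writer):
    # the single greenlet allowed to write to the CSV file
    while True:
        row = rows.get()
        if row is SENTINEL:
            break
        writer.writerow(row)

p = pool.Pool(10)
writer_job = gevent.spawn(write_rows, writer)   # `writer` is the UnicodeWriter from the question
for res in results:
    p.spawn(fetch_one, res)
p.join()               # wait for all fetch workers to finish
rows.put(SENTINEL)     # then tell the writer it can stop
writer_job.join()

On the sleep point: gevent.sleep(1) yields to other greenlets while waiting, whereas an unpatched time.sleep(1) blocks the whole process.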
Use fewer list comprehensions; writing the loop out is clearer (a short example follows).
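For instance, the spawn step from the question written as an explicit loop:

jobs = []
for res in results:
    jobs.append(p.spawn(workres, res))
p.join()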
I suggest keeping the try/except in 'workres' around just the 'parse(urlopen())' call; other exceptions may be occurring that you currently don't see (see the sketch below).
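A sketch of what that narrowing could look like, keeping the rest of the question's workres body unchanged (elided here):

def workres(res):
    try:
        tender = parse(urlopen(res)).getroot()
    except socket.gaierror:
        ex = sys.exc_info()[1]
        print('%s failed with %s' % (res, ex))
        return
    print('%s succeeded' % res)
    # ... the rest of the field extraction and the writer.writerow(row) calls
    # go here, outside the try block ...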