class MySpider(BaseSpider):
    """Spider that crawls several domains and schedules at most 10 PDF
    downloads per domain.

    A per-domain set of already-seen PDF URLs acts as the counter; using a
    set means re-discovering the same link does not inflate the count.
    """

    # List every domain the spider is allowed to touch (domain3..domain9
    # elided in the original example).
    allowed_domains = ['domain1.com', 'domain2.com', 'domain10.com']

    # One entry point per domain.
    start_urls = [
        'http://domain1.com/index.html',
        'http://domain2.com/a.html',
        'http://domain10.com/b.html',
    ]

    def __init__(self, *a, **kw):
        super(MySpider, self).__init__(*a, **kw)
        # domain -> set of distinct PDF URLs already scheduled for download.
        self.domain_counters = collections.defaultdict(set)

    def parse(self, response):
        # Local import so the snippet is self-contained; move to the top of
        # the module in real code.
        from urllib.parse import urljoin, urlparse

        # Fixed: the original XPath was missing its closing ']' and selected
        # the <a> node instead of its @href value.
        for href in response.xpath("//a[contains(@href, '.pdf')]/@href").extract():
            # The href can be relative, so resolve it against the page URL
            # before extracting the domain.
            url = urljoin(response.url, href)
            domain = urlparse(url).netloc

            self.domain_counters[domain].add(url)
            # Keep generating Requests as long as fewer than 10 distinct
            # PDF URLs have been seen for this domain.
            if len(self.domain_counters[domain]) < 10:
                yield Request(url=url, callback=self.download_pdf)
Scrapy 本身就是围绕并行抓取设计的，因此可以这样实现上述逻辑。
请注意：并行抓取多个域名时，每个域名被请求的频率会相应降低，这对各个站点来说反而更加礼貌。
相关问题 更多 >
编程相关推荐