Scrapy - recursively scraping to the third page
I hope my request is fairly straightforward for experienced Scrapy users.

In short, the code below works well for scraping a second page's data from links found on the first page. I would like to extend it so that it also scrapes a third page's data from links found on the second page. In the code below, def parse_items handles the start page (level 1), which contains 50 listings, and the spider is set up to recursively follow each of those 50 links. def parse_listing_page specifies which items to scrape from each "listing page". Within every listing page, I want the script to follow one more link to another page, scrape one or two items there, then return to the "listing page" and finally back to the start page.

The code below works well for two levels of recursion. How can I extend it to three?

from scrapy import log
from scrapy.log import ScrapyFileLogObserver
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from firstproject.items import exampleItem
from scrapy.http import Request
import urlparse

logfile_info = open('example_INFOlog.txt', 'a')
logfile_error = open('example_ERRlog.txt', 'a')
log_observer_info = log.ScrapyFileLogObserver(logfile_info, level=log.INFO)
log_observer_error = log.ScrapyFileLogObserver(logfile_error, level=log.ERROR)
log_observer_info.start()
log_observer_error.start()

class MySpider(CrawlSpider):
    name = "example"
    allowed_domains = ["example.com.au"]

    rules = (Rule(SgmlLinkExtractor(allow=("",), restrict_xpaths=('//li[@class="nextLink"]',)),
                  callback="parse_items", follow=True),
             )

    def start_requests(self):
        start_urls = reversed([
            "http://www.example.com.au/1?new=true&list=10-to-100",
            "http://www.example.com.au/2?new=true&list=10-to-100",
            "http://www.example.com.au/2?new=true&list=100-to-200",
        ])
        return [Request(url=start_url) for start_url in start_urls]

    def parse_start_url(self, response):
        return self.parse_items(response)

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        listings = hxs.select("//h2")
        items = []
        for listings in listings:
            item = exampleItem()
            item["title"] = listings.select("a/text()").extract()[0]
            item["link"] = listings.select("a/@href").extract()[0]
            items.append(item)
            url = "http://example.com.au%s" % item["link"]
            yield Request(url=url, meta={'item': item}, callback=self.parse_listing_page)

    def parse_listing_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        item["item_1"] = hxs.select('#censored Xpath').extract()
        item["item_2"] = hxs.select('#censored Xpath').extract()
        item["item_3"] = hxs.select('#censored Xpath').extract()
        item["item_4"] = hxs.select('#censored Xpath').extract()
        return item
Many thanks.
2 Answers
Here is my updated code. The code below extracts counter_link in the right format (I have tested this), but it seems to take the else branch, so parse_listing_counter is never called. If I remove the if and else and force the code to call parse_listing_counter, it returns nothing at all (not even the items from parse_items or from the listing pages).

What am I doing wrong in my code? I have also checked the XPaths, and they all look fine.

from scrapy import log
from scrapy.log import ScrapyFileLogObserver
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from firstproject.items import exampleItem
from scrapy.http import Request
import urlparse

logfile_info = open('example_INFOlog.txt', 'a')
logfile_error = open('example_ERRlog.txt', 'a')
log_observer_info = log.ScrapyFileLogObserver(logfile_info, level=log.INFO)
log_observer_error = log.ScrapyFileLogObserver(logfile_error, level=log.ERROR)
log_observer_info.start()
log_observer_error.start()

class MySpider(CrawlSpider):
    name = "example"
    allowed_domains = ["example.com.au"]

    rules = (Rule(SgmlLinkExtractor(allow=("",), restrict_xpaths=('//li[@class="nextLink"]',)),
                  callback="parse_items", follow=True),
             )

    def start_requests(self):
        start_urls = reversed([
            "http://www.example.com.au/1?new=true&list=10-to-100",
            "http://www.example.com.au/2?new=true&list=10-to-100",
            "http://www.example.com.au/2?new=true&list=100-to-200",
        ])
        return [Request(url=start_url) for start_url in start_urls]

    def parse_start_url(self, response):
        return self.parse_items(response)

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        listings = hxs.select("//h2")
        items = []
        for listings in listings:
            item = exampleItem()
            item["title"] = listings.select("a/text()").extract()[0]
            item["link"] = listings.select("a/@href").extract()[0]
            items.append(item)
            url = "http://example.com.au%s" % item["link"]
            yield Request(url=url, meta={'item': item}, callback=self.parse_listing_page)

    def parse_listing_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        item["item_1"] = hxs.select('#censored Xpath').extract()
        item["item_2"] = hxs.select('#censored Xpath').extract()
        item["item_3"] = hxs.select('#censored Xpath').extract()
        item["item_4"] = hxs.select('#censored Xpath').extract()
        item["counter_link"] = hxs.select('#censored Xpath').extract()[0]
        counter_link = response.meta.get('counter_link', None)
        if counter_link:
            url2 = "http://example.com.au%s" % item["counter_link"]
            yield Request(url=url2, meta={'item': item}, callback=self.parse_listing_counter)
        else:
            yield item

    def parse_listing_counter(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        item["counter"] = hxs.select('#censored Xpath').extract()
        return item
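
As an aside, one thing that stands out in the snippet above: the if tests response.meta.get('counter_link', None), but the previous callback only ever puts 'item' into meta, so that lookup is always None and the else branch runs. A minimal sketch of branching on the value just extracted into the item instead, assuming that is the intent (the XPath is still a placeholder), might look like this:

    def parse_listing_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        item["counter_link"] = hxs.select('#censored Xpath').extract()[0]
        # Branch on the freshly scraped value rather than on response.meta,
        # which never had 'counter_link' set by parse_items.
        if item["counter_link"]:
            url2 = "http://example.com.au%s" % item["counter_link"]
            yield Request(url=url2, meta={'item': item}, callback=self.parse_listing_counter)
        else:
            yield item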
Here is how this code flows.

First, the Rule constructor in the MySpider class is applied. That Rule is configured with a callback called parse_items. There is a yield at the end of parse_items, which makes the function recurse into parse_listing_page. If you want to recurse a third level deep from parse_listing_page, there has to be a Request yielded from parse_listing_page.