Scrapy 爬虫只会爬取页面，却不抓取数据

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from project2.items import Project2Item
from scrapy.http import Request


class ProjectSpider(BaseSpider):
    """Crawl the paginated computer-repair listing on directory.thesun.co.uk.

    Each response is searched for ``div`` elements whose ``class`` attribute
    is exactly ``"abTbl "`` (note the trailing space); one ``Project2Item``
    is yielded per match, followed by a request for the next listing page.
    """

    name = "project2spider"
    # Scrapy expects bare domain names here, not URLs with a scheme/path.
    allowed_domains = ["directory.thesun.co.uk"]
    # Number of the most recently requested listing page.
    current_page_no = 1
    start_urls = [
        'http://directory.thesun.co.uk/find/uk/computer-repair'
    ]

    def get_next_url(self, fired_url):
        """Return the URL of the next listing page, or ``None`` to stop.

        ``None`` is returned only when ``fired_url`` has no ``/page/``
        segment although the page counter has already moved past 1.
        Otherwise the counter is incremented and the matching URL is built.

        NOTE(review): for URLs that contain ``/page/`` there is no stop
        condition visible here, so the crawl presumably continues until it
        is interrupted -- confirm how the site responds past the last page.
        """
        if '/page/' not in fired_url and self.current_page_no != 1:
            # End of scroll.
            return None
        self.current_page_no += 1
        return (
            "http://directory.thesun.co.uk/find/uk/computer-repair/page/%s"
            % self.current_page_no
        )

    def parse(self, response):
        """Yield one item per listing found in ``response``, then the next page.

        The XPath expressions below declare which fields are scraped; every
        field is stored as the list returned by ``extract()``.
        """
        fired_url = response.url
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="abTbl "]')
        for site in sites:
            item = Project2Item()
            item['Catogory'] = site.select('span[@class="icListBusType"]/text()').extract()
            item['Bussiness_name'] = site.select('a/@title').extract()
            item['Description'] = site.select('span[last()]/text()').extract()
            item['Number'] = site.select('span[@class="searchInfoLabel"]/span/@id').extract()
            item['Web_url'] = site.select('span[@class="searchInfoLabel"]/a/@href').extract()
            item['adress_name'] = site.select('span[@class="searchInfoLabel"]/span/text()').extract()
            item['Photo_name'] = site.select('img/@alt').extract()
            item['Photo_path'] = site.select('img/@src').extract()
            yield item

        next_url = self.get_next_url(fired_url)
        if next_url:
            yield Request(next_url, self.parse, dont_filter=True)

1条回答

网友

1楼 · 发布于 2024-04-18 23:08:50

到目前为止你尝试了什么？一种解决方案是在调用下一页时使用一个类似索引的参数作为元数据传递。比如：

def parse(self, response):
    """Parse a listing page, choosing the XPath from the page index in meta.

    The index is read from ``response.meta['index']`` and defaults to 0 when
    the key is absent. It is forwarded, incremented by one, on the request
    for the next page.

    NOTE(review): the second XPath is only used once the index exceeds 1;
    confirm that this matches the page on which the site's markup changes.
    """
    hxs = HtmlXPathSelector(response)
    # A Python identifier cannot start with a digit, hence no "2nd_" prefix.
    index = response.meta.get('index', 0)
    use_second_xpath = index > 1
    if use_second_xpath:
        sites = hxs.select('//div[@class="icListItem"]')
    else:
        sites = hxs.select('//div[@class="abTbl "]')

    # Elided in this sketch: item extraction from ``sites`` and the
    # computation of ``next_url``, which the request below relies on.
    ...

    request = Request(next_url, self.parse, dont_filter=True)
    request.meta['index'] = index + 1
    yield request

这段代码当然还可以改进，但你应该已经明白大致思路了。

相关问题更多 >

编程相关推荐

热门问题

热门文章