重新加载页面

import scrapy import re from icbse.items import IcbseItem class IcbseSpider(scrapy.Spider): name = "icbse" allowed_domains = ["www.icbse.com"] start_urls = [ "http://www.icbse.com/schools/", ] def parse(self, response): for i in xrange(20): # I thought if i iterate the start URL, # I could probably have the page reload. # It didn't work though. for href in response.xpath( '//div[@class="row"]/div[3]//span[@class="list-group-item"]\ /a/@href').extract(): url = response.urljoin(href) yield scrapy.Request(url, callback=self.parse_dir_contents) def parse_dir_contents(self, response): # total number of schools found on page pages = response.xpath( "//div[@class='container']/strong/text()").extract()[0] self.captured_schools_set = set() # Placing the Set here doesn't work! while len(self.captured_schools_set) != int(pages): yield scrapy.Request(response.url, callback=self.reload_url) for school in self.captured_schools_set: yield scrapy.Request(school, callback=self.scrape_school_info) def reload_url(self, response): for school_href in response.xpath( "//h4[@class='school_name']/a/@href").extract(): self.captured_schools_set.add(response.urljoin(school_href)) def scrape_school_info(self, response): item = IcbseItem() try: item["School_Name"] = response.xpath( '//td[@class="tfield"]/strong/text()').extract()[0] except: item["School_Name"] = '' pass try: item["streetAddress"] = response.xpath( '//td[@class="tfield"]')[1].xpath( "//span[@itemprop='streetAddress']/text()").extract()[0] except: item["streetAddress"] = '' pass yield item

1条回答

网友

1楼 · 发布于 2024-05-15 21:43:41

您正在迭代一个空集：

        self.captured_schools_set = set()  # Placing the Set here doesn't work!

        while len(self.captured_schools_set) != int(pages):
            yield scrapy.Request(response.url, callback=self.reload_url)

        for school in self.captured_schools_set:
            yield scrapy.Request(school, callback=self.scrape_school_info)

因此，school的请求永远不会被触发。在

您应该使用dont_filter=True属性重新加载、启动http://www.icbse.com/schools/请求，因为在默认设置中，scrapy将重复项过滤掉。在

但看起来您并没有触发http://www.icbse.com/schools/请求，而是（http://www.icbse.com/schools/state/andaman-nicobar）“/state/name”请求；在上面的第4行中，您正在启动请求.url，有个问题，请改为/学校/

相关问题更多 >

编程相关推荐

热门问题

热门文章