即使我已经收集并解析了链接，Scrapy 爬虫仍然不会跟进这些链接

# -*- coding: utf-8 -*-
import logging

import scrapy
from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from banker.items import BarclaysOfferItem


# A plain Spider is used as the base: no crawl rules are defined here, and the
# Scrapy documentation advises against overriding ``parse`` on a CrawlSpider.
class BarclaySpider(scrapy.Spider):
    """Collect offer rows from the Barclaycard Rewards Boost listing pages.

    ``parse`` handles the start page and schedules one request per link found
    in the ``mn_pageLinks`` paragraphs; ``parse_item`` turns every table row
    of those pages into a ``BarclaysOfferItem``.
    """

    name = "barclay"
    # Entries must be bare domain names. With a trailing "/" the entry never
    # matches a request's host, so OffsiteMiddleware drops every followed link.
    allowed_domains = ['partners.barclaycardrewardsboost.com']
    # Only page 1 is seeded; further pages are reached through the links that
    # parse() follows.
    start_urls = [
        # "%20" is a URL-encoded space. Writing it as "%" + backslash + "20"
        # would make Python read the backslash sequence as an octal escape and
        # put a control character into the URL.
        'https://partners.barclaycardrewardsboost.com/shopping/sp____.htm'
        '?rows=25&page=1&ref_page_id=2167&ref_section_id=9720'
        '&ref_section_title=All%20Online%20Offers',
    ]

    def parse(self, response):
        """Schedule a request for every link in the ``mn_pageLinks`` blocks.

        Args:
            response: Scrapy response for one of ``start_urls``.

        Yields:
            scrapy.Request: one request per ``href`` found, handled by
            ``parse_item``.
        """
        # NOTE(review): the start page is used for link discovery only; its
        # own table rows are never passed to parse_item -- confirm intended.
        base = 'https://partners.barclaycardrewardsboost.com/shopping/sp____.htm'
        # Selecting @href directly skips anchors that have no href attribute
        # instead of failing on an empty result list.
        hrefs = response.xpath(
            '//p[contains(@class, "mn_pageLinks")]/a/@href'
        ).extract()
        for href in hrefs:
            # Presumably each href is a query string ("?rows=...") to be
            # appended to the listing URL -- verify against the live markup.
            url = base + href
            logging.info(url)
            yield scrapy.Request(url, callback=self.parse_item)

    def parse_item(self, response):
        """Build one ``BarclaysOfferItem`` per ``//table/tr`` row.

        Args:
            response: Scrapy response for a page scheduled by ``parse``.

        Yields:
            BarclaysOfferItem: every field holds the list of text nodes
            matched by its XPath (possibly empty).
        """
        for sel in response.xpath('//table/tr'):
            item = BarclaysOfferItem()
            item['merchant'] = sel.xpath('td/div/a[last()]/text()').extract()
            item['rate'] = sel.xpath('td/span/a/text()').extract()
            item['offer'] = sel.xpath('td/a[last()]/text()').extract()
            item['coupon_code'] = sel.xpath('td[@class="mn_cpCode"]/text()').extract()
            item['expiration_date'] = sel.xpath('td[@class="mn_expiry"]/text()').extract()
            yield item

1条回答

网友

1楼 · 发布于 2024-04-26 06:26:28

我终于成功了！

根据 Scrapy 文档，如果启用了 OffsiteMiddleware，那么域名不在 allowed_domains 列表中的请求将不会被跟进。我知道我的网址属于指定的域，但我猜测，网站查询数据的方式使这些网址看起来像是站外链接。

# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import Spider, Rule
from scrapy.linkextractors import LinkExtractor
import logging
from banker.items import BarclaysOfferItem

class BarclaySpider(Spider):
    """Collect offer rows from the Barclaycard Rewards Boost listing pages.

    ``parse`` handles the start page and schedules one request per link found
    in the ``mn_pageLinks`` paragraphs; ``parse_item`` turns every table row
    of those pages into a ``BarclaysOfferItem``.
    """

    name = "barclay"
    start_urls = [
        # "%20" is a URL-encoded space. Writing it as "%" + backslash + "20"
        # would make Python read the backslash sequence as an octal escape and
        # put a control character into the URL.
        'https://partners.barclaycardrewardsboost.com/shopping/sp____.htm'
        '?rows=25&page=1&ref_page_id=2167&ref_section_id=9720'
        '&ref_section_title=All%20Online%20Offers',
    ]

    # Parse for the links of interest
    def parse(self, response):
        """Schedule a request for every link in the ``mn_pageLinks`` blocks.

        Args:
            response: Scrapy response for one of ``start_urls``.

        Yields:
            scrapy.Request: one request per ``href`` found, handled by
            ``parse_item``.
        """
        # NOTE(review): the start page is used for link discovery only; its
        # own table rows are never passed to parse_item -- confirm intended.
        base = 'https://partners.barclaycardrewardsboost.com/shopping/sp____.htm'
        # Selecting @href directly skips anchors that have no href attribute
        # instead of failing on an empty result list.
        hrefs = response.xpath(
            '//p[contains(@class, "mn_pageLinks")]/a/@href'
        ).extract()
        for href in hrefs:
            # Presumably each href is a query string ("?rows=...") to be
            # appended to the listing URL -- verify against the live markup.
            url = base + href
            logging.info(url)
            yield scrapy.Request(url, callback=self.parse_item)

    # Parse for the items of interest
    def parse_item(self, response):
        """Build one ``BarclaysOfferItem`` per ``//table/tr`` row.

        Args:
            response: Scrapy response for a page scheduled by ``parse``.

        Yields:
            BarclaysOfferItem: every field holds the list of text nodes
            matched by its XPath (possibly empty).
        """
        for sel in response.xpath('//table/tr'):
            item = BarclaysOfferItem()
            item['merchant'] = sel.xpath('td/div/a[last()]/text()').extract()
            item['rate'] = sel.xpath('td/span/a/text()').extract()
            item['offer'] = sel.xpath('td/a[last()]/text()').extract()
            item['coupon_code'] = sel.xpath('td[@class="mn_cpCode"]/text()').extract()
            item['expiration_date'] = sel.xpath('td[@class="mn_expiry"]/text()').extract()
            yield item

相关问题更多 >

编程相关推荐

热门问题

热门文章