Scrapy：创建一个抓取索引页、并保存每个对应链接（Link）的完整 HTML 页面的 Spider

try:
    from urllib.parse import urlsplit  # Python 3
except ImportError:
    from urlparse import urlsplit  # Python 2

import scrapy
from scrapy.http import Request


def _page_filename(url):
    """Return a local file name built from the whole path of ``url``.

    Using the full path (with "/" replaced by "_") instead of only the last
    segment keeps pages that share a final segment such as ``index.html``
    from overwriting each other. A URL with an empty path maps to
    ``index.html``.
    """
    path = urlsplit(url).path.strip("/")
    return (path.replace("/", "_") or "index") + ".html"


class BasicSpider(scrapy.Spider):
    """Crawl the start page's category links and save each page as HTML."""

    name = "basic"
    # Must be a real domain of the target site: requests to hosts that do not
    # match an entry here are dropped by Scrapy's offsite filtering, which is
    # why a placeholder such as "web" results in no pages being saved.
    allowed_domains = ["toscrape.com"]
    start_urls = (
        'http://books.toscrape.com/',
    )

    def parse(self, response):
        """Yield one request per category link found in the side navigation."""
        # My Link Extractor
        next_selector = response.xpath(
            '//*[@class="nav nav-list"]/li/ul/li/a/@href'
        )
        for url in next_selector.extract():
            # The extracted hrefs are relative; resolve them against the page.
            yield Request(response.urljoin(url), callback=self.parse_item)

    def parse_item(self, response):
        """Write the raw body of ``response`` to a file in the working directory."""
        # My Page Saver
        filename = _page_filename(response.url)
        with open(filename, 'wb') as f:
            f.write(response.body)

1条回答

网友

1楼 · 发布于 2024-05-19 20:12:57

import urlparse
import scrapy
from scrapy.linkextractors import LinkExtractor

from scrapy.http import Request


class BooksSpider(scrapy.Spider):
    """Follow the "next" pagination link on the start page and save that page."""

    name = "basic"
    # allowed_domains must hold real domain names of the site being scraped.
    # A bare label such as "toscrape" does not match "books.toscrape.com",
    # so the requests yielded by parse() would be filtered out as offsite.
    allowed_domains = ["toscrape.com"]
    start_urls = (
        'http://books.toscrape.com/',
    )

    def parse(self, response):
        """Yield a request for every link found inside a ``class="next"`` element."""
        # My Link Extractor
        # This is how LinkExtractor is used directly; alternatively a spider
        # Rule can be created for the next page. Read more about LinkExtractor
        # at https://doc.scrapy.org/en/latest/topics/link-extractors.html
        next_page_links = LinkExtractor(
            restrict_xpaths='//*[@class="next"]'
        ).extract_links(response)
        for next_page in next_page_links:
            yield Request(next_page.url, callback=self.parse_item)

    def parse_item(self, response):
        """Write the raw body of ``response`` to ``<last URL segment>.html``."""
        # My Page Saver
        # A URL ending in "/" has an empty last segment; fall back to "index"
        # so the page is not written to a nameless hidden file called ".html".
        page_name = response.url.split("/")[-1] or "index"
        filename = page_name + '.html'
        with open(filename, 'wb') as f:
            f.write(response.body)

我希望这对你有帮助

相关问题更多 >

编程相关推荐

热门问题

热门文章