Using Selenium with Scrapy's LinkExtractor

Posted 2024-05-23 13:54:31


Is there a way to use Selenium together with LinkExtractor to handle JavaScript? I want to crawl a website and pick up the PDF files on its pages, but many of the PDFs only become available after the JavaScript has executed.

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
from selenium import webdriver 
from scrapy.http import Request

class Fetcher(CrawlSpider):
    name = "Fetcher"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/"]

    rules = [Rule(LinkExtractor(allow=()), follow=True, callback='parse_item')]

    def __init__(self, *a, **kw):
        super(Fetcher, self).__init__(*a, **kw)
        self.driver = webdriver.PhantomJS()  # instantiate the driver (note the parentheses)
        self.links = open("links.txt", "a")
        self.pdfs = open("pdfs.txt", "a")

    def parse_start_url(self, response):
        # Do stuff
        pass

    def parse_item(self, response):
        # Do stuff
        pass

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(Fetcher)
process.start() # the script will block here until the crawling is finished
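For context, a minimal sketch of what parse_item might do to pick up the PDF links. The .pdf suffix check and the use of the links.txt/pdfs.txt handles opened in __init__ are assumptions, not part of the original question:

    def parse_item(self, response):
        # Record the visited page, then any links ending in .pdf.
        self.links.write(response.url + "\n")
        for href in response.css("a::attr(href)").getall():
            if href.lower().endswith(".pdf"):
                self.pdfs.write(response.urljoin(href) + "\n")
        self.pdfs.flush()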

Tags: from, import, self, pdf, parse, def, selenium, javascript
1 Answer

User

#1 · Posted 2024-05-23 13:54:31

I suggest you use a DownloaderMiddleware to intercept these requests, download the pages with Selenium, and return an HtmlResponse to the spider. Something like this:

from selenium import webdriver
from scrapy.http import HtmlResponse

class SeleniumMiddleware(object):

    def __init__(self):
        self.driver = webdriver.PhantomJS() # Or whichever browser you want

    # process_request is called for every request the spider schedules,
    # including the ones your LinkExtractor produced. Fetch the page with
    # selenium and hand the rendered HTML back to the spider as a response.
    def process_request(self, request, spider):
        self.driver.get(request.url)
        body = self.driver.page_source
        return HtmlResponse(self.driver.current_url, body=body, encoding='utf-8', request=request)
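Note that PhantomJS is deprecated and no longer supported by recent Selenium releases; a headless Chrome driver is a common drop-in replacement (a sketch, assuming chromedriver is available on your PATH):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")          # no visible browser window
driver = webdriver.Chrome(options=options)  # requires chromedriver on PATH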

Remember to add the middleware to your settings:

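A minimal sketch, assuming the middleware class above lives in myproject/middlewares.py (adapt the dotted path and the priority number to your project):

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.SeleniumMiddleware': 543,
}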
