有没有一种方法可以同时使用 selenium 处理 JavaScript 和 LinkExtractor？我想爬取的网站网页中包含 PDF 文件，其中许多 PDF 只有在 JavaScript 执行之后才可用。
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
from selenium import webdriver
from scrapy.http import Request
class Fetcher(CrawlSpider):
    """Crawl example.com, following every extracted link, to collect page
    and PDF URLs into two append-mode output files.

    Uses a Selenium PhantomJS driver (held in ``self.driver``) so pages
    whose content requires JavaScript execution can be rendered.
    """

    name = "Fetcher"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/"]
    # follow=True keeps the crawl going; parse_item is called for every link.
    rules = [Rule(LinkExtractor(allow=()), follow=True, callback='parse_item')]

    def __init__(self, *a, **kw):
        super(Fetcher, self).__init__(*a, **kw)
        # Bug fix: the original assigned the class object itself
        # (webdriver.PhantomJS) instead of instantiating a driver.
        self.driver = webdriver.PhantomJS()
        self.links = open("links.txt", "a")
        self.pdfs = open("pdfs.txt", "a")

    def closed(self, reason):
        """Scrapy hook, called when the spider finishes: release the
        webdriver and the output file handles (the original leaked both)."""
        self.driver.quit()
        self.links.close()
        self.pdfs.close()

    def parse_start_url(self, response):
        """Handle the start URL (CrawlSpider hook for start_urls responses)."""
        # Do stuff
        pass

    def parse_item(self, response):
        """Handle each response produced by the LinkExtractor rule."""
        # Do stuff
        pass
# Run the spider in-process with a custom user agent; start() blocks
# until the entire crawl has finished.
crawler_settings = {
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
}
process = CrawlerProcess(crawler_settings)
process.crawl(Fetcher)
process.start()
我建议您使用 DownloaderMiddleware 来拦截这些请求，用 selenium 下载并渲染页面，然后把 HtmlResponse 返回给 spider。像这样：
请记住将中间件添加到您的设置中:
^{pr2}$（注：原答案此处的中间件代码片段在页面抓取时被占位符替换，已丢失。）相关问题 更多 >
编程相关推荐