CrawlSpider does not follow the rules on some websites
I am trying to get my first Scrapy project off the ground, but I have run into a strange problem: on some websites my crawler works fine, while on others it simply does not follow the link-extraction rules. I searched StackOverflow and found people with similar issues, but in their case the allow parameter was badly formatted and they were getting Filtered offsite request errors, which is not happening to me. My log is here: http://pastebin.com/r1pXmeJW (the failing website first, then one that works, since I cannot post more than two links...).

My crawler is driven by a Python script that uses the API:
# -*- coding: utf-8 -*-
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings
from govcrawl.spiders.main_spider import DomainSpider
import sys, urlparse, re
from scrapy.contrib.spiders import Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
args = sys.argv[1].split('§')
url_id = args[0]
start_url = args[1]
url_parts = urlparse.urlparse(start_url)
allowed_domain = url_parts.netloc
allowed_path = '/'.join(url_parts.path.split('/')[:-1])
cur_state = sys.argv[2]
spider = DomainSpider(
    start_urls = [start_url],
    allowed_domains = [allowed_domain],
    url_id = url_id,
    cur_state = cur_state,
    rules = (
        Rule(
            LxmlLinkExtractor(
                allow = re.compile(r".*%s.*" % re.escape(allowed_path), re.IGNORECASE),
                allow_domains = [allowed_domain],
                tags = ('a', 'area', 'frame'),
                attrs = ('href', 'src')
            ),
            callback = "parse_items",
            follow = True
        ),
    )
)
settings = get_project_settings()
crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal = signals.spider_closed)
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
reactor.run()
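For reference, the script expects the url id and the start url packed into a single argument separated by '§', plus a state label as the second argument. Something along these lines (the script name and the values here are just placeholders):

# -*- coding: utf-8 -*-
import subprocess

# Hypothetical invocation of the control script shown above.
subprocess.call([
    "python", "run_domain_spider.py",              # the control script
    "42§http://www.example.gov/dept/index.html",   # sys.argv[1]: "<url_id>§<start_url>"
    "some_state",                                  # sys.argv[2]: cur_state
])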
And here is my DomainSpider:
import re
from govcrawl.items import DomainItem
from scrapy.utils.markup import remove_tags
from scrapy.contrib.spiders import CrawlSpider
from scrapy import log
class DomainSpider(CrawlSpider):
    name = "govcrawl_main"

    def parse_start_url(self, response):
        return self.parse_items(response)

    def parse_items(self, response):
        pages_done = self.crawler.stats.get_value('downloader/response_count')
        pages_todo = self.crawler.stats.get_value('scheduler/enqueued') - self.crawler.stats.get_value('downloader/response_count')
        log.msg("URL: %s (%s) Crawled %d pages. To Crawl: %d" % (self.start_urls[0], self.url_id, pages_done, pages_todo), spider = self)
        links = []
        for sel in response.xpath('//a'):
            href = sel.xpath('@href').extract()
            if len(href) > 0:
                href = href[0]
                if href.startswith("http"):
                    links.append(href)
        item = DomainItem()
        item["url"] = response.url
        item["text"] = re.sub(r'\s{2,}', ' ', remove_tags(' '.join(response.xpath('//body//text()').extract()))).strip()
        item["links"] = links
        self.crawler.stats.inc_value('pages_crawled')
        yield item
Is there any way to make the crawler follow the rules on the websites where it is failing?
1 Answer
It turns out that the failing pages contain malformed HTML with multiple </html> tags, which the lxml parser does not like at all. Since scrapy does not let you use a different parser with CrawlSpider, I ended up re-implementing a plain Spider that behaves more or less like a CrawlSpider:
import urlparse, re
from scrapy import Spider, log
from bs4 import BeautifulSoup
from scrapy.http import Request
from govcrawl.items import DomainItem
class DomainSimpleSpider(Spider):
    name = "govcrawl_simple"

    def parse(self, response):
        pages_done = self.crawler.stats.get_value('downloader/response_count')
        pages_todo = self.crawler.stats.get_value('scheduler/enqueued') - self.crawler.stats.get_value('downloader/response_count')
        log.msg("URL: %s (%s) Crawled %d pages. To Crawl: %d" % (self.start_urls[0], self.url_id, pages_done, pages_todo), spider = self)

        soup = BeautifulSoup(response._body, "html5lib")

        links = []
        for tag in self.tags:
            for a in soup.find_all(tag):
                for attr in self.attrs:
                    if attr in a.attrs:
                        href = a.attrs[attr]
                        if href.startswith("http"):
                            links.append(href)
                        href = urlparse.urljoin(response.url, href)
                        href_parts = urlparse.urlparse(href.replace('\t', '').replace('\r', '').replace('\n', '').replace(' ', '+'))
                        if re.match(self.allow, href_parts.path) and not self.forbidden_extension(href_parts.path):
                            yield Request(href)

        for script in soup(["script", "style"]):
            script.extract()

        item = DomainItem()
        item["url"] = response.url
        item["text"] = soup.get_text()
        item["links"] = links
        self.crawler.stats.inc_value('pages_crawled')
        yield item

    def forbidden_extension(self, url):
        url = url.lower()
        return url.endswith((
            "pdf", "jpg", "jpeg", "png", "gif", "wmv", "avi", "flv", "mov", "vob",
            "mp3", "mp4", "wav", "wma", "ppt", "pptx", "xls", "xlsx", "doc", "docx",
            "odf", "ods", "zip", "gz", "tar", "7z", "rar"
        ))
The spider can be driven by the following Python script:
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings
from govcrawl.spiders.simple_spider import DomainSimpleSpider
import urlparse, re
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
start_url = ...
url_parts = urlparse.urlparse(start_url)
allowed_domain = url_parts.netloc
allowed_path = '/'.join(url_parts.path.split('/')[:-1])
spider = DomainSimpleSpider(
    start_urls = [start_url],
    allowed_domains = [allowed_domain],
    allow = re.compile(r".*%s.*" % re.escape(allowed_path), re.IGNORECASE),
    tags = ('a', 'area', 'frame'),
    attrs = ('href', 'src'),
    response_type_whitelist = [r"text/html", r"application/xhtml+xml", r"application/xml"]
)
settings = get_project_settings()
crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal = signals.spider_closed)
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
reactor.run()
A few notes:

I used BeautifulSoup with the html5lib parser instead of lxml. html5lib handles the multiple </html> tags just fine, but it is an external dependency, so you have to install it.
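If you want to see the difference for yourself, a quick comparison like the following shows what each parser extracts from a document with a stray extra </html> tag (the markup here is made up, just to illustrate the kind of breakage):

# Compare what lxml and html5lib extract from markup containing a
# premature </html> tag (made-up example, for illustration only).
import lxml.html
from bs4 import BeautifulSoup

broken = (
    "<html><body>"
    "<a href='http://example.com/first'>first</a>"
    "</html>"  # premature closing tag
    "<a href='http://example.com/second'>second</a>"
    "</body></html>"
)

lxml_links = [a.get("href") for a in lxml.html.fromstring(broken).iter("a")]
html5lib_links = [a.get("href") for a in BeautifulSoup(broken, "html5lib").find_all("a")]

print "lxml:    ", lxml_links
print "html5lib:", html5lib_links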
Also, for some reason the file-type check did not seem to work reliably, so I added a forbidden_extension method that prevents Requests from being created for non-HTML files, and I had to add another DownloaderMiddleware that makes use of the spider's response_type_whitelist (see Python Scrapy - filter based on file type to avoid downloading non-text files for more on the middleware implementation; a rough sketch of the idea follows below).

Finally, this spider seems to process the start page twice, but frankly I did not care enough to fix that.
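A minimal sketch of such a middleware, assuming the whitelist entries are matched as plain substrings of the Content-Type header (the class name and module path are made up, and this is not the exact code from the linked question):

from scrapy.exceptions import IgnoreRequest

class ResponseTypeFilterMiddleware(object):
    """Drop responses whose Content-Type is not in the spider's whitelist."""

    def process_response(self, request, response, spider):
        whitelist = getattr(spider, "response_type_whitelist", None)
        if whitelist is None:
            return response
        content_type = response.headers.get("Content-Type", "")
        if any(allowed in content_type for allowed in whitelist):
            return response
        # Raising IgnoreRequest from process_response discards the response.
        raise IgnoreRequest("Non-whitelisted Content-Type: %s" % content_type)

# Enable it in settings.py, for example (the module path is an assumption):
# DOWNLOADER_MIDDLEWARES = {
#     'govcrawl.middlewares.ResponseTypeFilterMiddleware': 543,
# }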