我在分析一个网站，并写了一个爬虫（spider）：
# -*- coding: utf-8 -*-
from quoka.items import QuokaItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import TakeFirst
from scrapy.loader import XPathItemLoader
from scrapy.selector import HtmlXPathSelector
class QuokaLoader(XPathItemLoader):
    """Item loader whose fields yield a single value (the first match)
    instead of a list of all matches."""
    # NOTE(review): XPathItemLoader is the legacy (pre-0.22) name of
    # ItemLoader — presumably this code targets an old Scrapy release;
    # confirm against the installed version.
    default_output_processor = TakeFirst()
class QuokaSpider(CrawlSpider):
    """Crawl office/commercial-space listings on quoka.de.

    Run with the spider's ``name`` attribute — ``scrapy crawl quoka`` —
    not the filename of the module.
    """
    name = "quoka"
    allowed_domains = ["quoka.de"]
    start_urls = ["http://www.quoka.de/immobilien/bueros-gewerbeflaechen/"]
    rules = (
        # Follow pagination pages without scraping them.
        Rule(LinkExtractor(allow=('kleinanzeigen/cat_27_2710_ct_0_page_',)),
             follow=True),
        # Scrape individual listing pages.
        Rule(LinkExtractor(allow=('immobilien/bueros-gewerbeflaechen/',)),
             callback='parse_item'),
    )

    # XPath expressions for the listing fields (absolute paths copied from
    # the browser; brittle against layout changes).
    _DATE_XPATH = ("/html/body/div[3]/div[2]/div[1]/main/div[8]"
                   "/div/div[2]/strong/span/text()")
    _COST_XPATH = ("/html/body/div[3]/div[2]/div[1]/main/div[8]"
                   "/div/div[3]/div[2]/div[2]/text()")

    def parse_item(self, response):
        """Extract the date and cost fields from one listing page.

        :param response: the downloaded listing page
        :returns: a populated QuokaItem
        """
        hxs = HtmlXPathSelector(response)
        loader = QuokaLoader(QuokaItem(), hxs)
        # BUG FIX: add_xpath() takes an XPath *expression string* as its
        # second argument.  The original code passed the already-extracted
        # list (response.xpath(...).extract()), which the loader would then
        # try to evaluate as XPath and fail.
        loader.add_xpath('date', self._DATE_XPATH)
        loader.add_xpath('cost', self._COST_XPATH)
        # loader.add_value('url', response.url)
        return loader.load_item()
输入命令:sudo scrapy crawl quoka_spider.py
但我遇到了一个神秘的错误：
（此处原文的错误信息/回溯已在转载中丢失。）我用的是 Ubuntu 16.04、Python 3.5，通过 pip3 安装了 scrapy；我重新安装了 scrapy，但没有成功。如何修复？
你应该使用爬虫的 name 属性，
而不是文件名。
输入：
scrapy crawl quoka
而不是 scrapy crawl quoka_spider.py。相关问题 更多 >
编程相关推荐