scrapy: CrawlSpider中的'KeyError'异常

Question

我正在尝试从以下网站提取所有相关信息，以便将所有数据加载到电子表格中：

http://yellowpages.com.gh/Home.aspx?

我猜想我需要使用CrawlSpider，所以我一直在尝试构建这个：

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.item import Item
class YellowGH2Spider(CrawlSpider):
    name = "yellowGH2"
    allowed_domains = ["yellowpages.com.gh"]
    start_urls = ["http://yellowpages.com.gh/Home.aspx"]
    rules = (
        Rule(SgmlLinkExtractor(allow=(r'http://yellowpages.com.gh/Home.aspx?mcaid=\d+#tabs-2', ))),
        Rule(SgmlLinkExtractor(allow=(r'http://yellowpages.com.gh/(Home|Search-Results).aspx?mcaid=[0-9&eca1id=]+(&lcaid=)?\d+#tabs-2', )), callback='parse_item'),
        Rule(SgmlLinkExtractor(allow=(r'http://yellowpages.com.gh/Company-Details/[a-zA-Z0-9-]+.aspx?returnurl=/Search-Results.aspx', )), callback='parse_item'),
        )
    def parse(self, response):
        #hxs = HtmlXPathSelector(response)
        #filename = response.url.split("/")[-2]
        #open(filename, 'wb').write(response.body)

        sel = Selector(response)
        item = Item()
        #item['catName']=sel.xpath('//div[@class="oneDirCat"]/h3/a/text()').extract()
        item['catLink']=sel.xpath('//div[@class="oneDirCat"]/h3/a/@href').extract()
        item['subcatText']=sel.xpath('//ul/li/a/@href').extract()
        item['subcatLink']=sel.xpath('//div[@class="oneDirCat"]/h3/a/text()').extract()
        item['company']=sel.xpath('//label/text()').extract()
        item['more']=sel.xpath('//td[@valign="bottom"]/a/@href').extract()
        item['address']=sel.xpath('//td[2]/text()').extract()
        item['postAddress']=sel.xpath('//td[4]/text()').extract()
        item['city']=sel.xpath('//td[6]/text()').extract()
        item['region']=sel.xpath('//td[8]/text()').extract()
        item['mobile']=sel.xpath('//td[12]/text()').extract()
        item['emailtext']=sel.xpath('//td[16]/a/text()').extract()
        item['emailLink']=sel.xpath('//td[16]/a/@href').extract()
        item['webtext']=sel.xpath('//td[18]/a/text()').extract()
        item['webLink']=sel.xpath('//td[18]/a/@href').extract()
        return item


            #print catName, catLink, subcatText, subcatLink, company, more,
            #address, postAddress, city, region, mobile, emailtext, emailLink,
            #webtext, webLink

但是，当我在命令提示符下运行这个时，出现了以下错误：

exceptions.KeyError: 'Item does not support field: catLink'

出现这种错误的最可能原因是什么呢？这可能和我的XPaths格式有关吗？还是说这可能和这个爬虫与项目中原始爬虫共享同一个items.py文件有关？

我的items.py代码如下：

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field

class YellowghItem(Item):
    # define the fields for your item here like:
    # name = Field()
      catName = Field()
      catLink = Field()
      subcatText = Field()
      subcatLink = Field()
      company = Field()
      more = Field()
      address = Field()
      postAddress = Field()
      city = Field()
      region = Field()
      mobile = Field()
      emailtext = Field()
      emailLink = Field()
      webtext = Field()
      webLink = Field()

      #pass

异常处理数据提取 xpath scrapy 爬虫技术 keyerror crawlspider items.py

scrapy: CrawlSpider中的'KeyError'异常

1 个回答

撰写回答