下载scrapy图片时出错

Question

我有一个 scrapy spider，用来从一些电商网站抓取图片和内容。现在我想下载图片，我写了一些代码，但出现了这个错误：

..

          File "/usr/lib/python2.7/pprint.py", line 238, in format
            return _safe_repr(object, context, maxlevels, level)
          File "/usr/lib/python2.7/pprint.py", line 282, in _safe_repr
            vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level)
          File "/usr/lib/python2.7/pprint.py", line 323, in _safe_repr
            rep = repr(object)
          File "/usr/local/lib/python2.7/dist-packages/Scrapy-0.23.0-py2.7.egg/scrapy/item.py", line 77, in __repr__
            return pformat(dict(self))
          File "/usr/lib/python2.7/pprint.py", line 63, in pformat
            return PrettyPrinter(indent=indent, width=width, depth=depth).pformat(object)
          File "/usr/lib/python2.7/pprint.py", line 122, in pformat
            self._format(object, sio, 0, 0, {}, 0)
          File "/usr/lib/python2.7/pprint.py", line 140, in _format
            rep = self._repr(object, context, level - 1)
          File "/usr/lib/python2.7/pprint.py", line 226, in _repr
            self._depth, level)
          File "/usr/lib/python2.7/pprint.py", line 238, in format
            return _safe_repr(object, context, maxlevels, level)
          File "/usr/lib/python2.7/pprint.py", line 282, in _safe_repr
            vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level)
          File "/usr/lib/python2.7/pprint.py", line 323, in _safe_repr
            rep = repr(object)
          File "/usr/local/lib/python2.7/dist-packages/Scrapy-0.23.0-py2.7.egg/scrapy/item.py", line 77, in __repr__
            return pformat(dict(self))
          File "/usr/lib/python2.7/pprint.py", line 63, in pformat
            return PrettyPrinter(indent=indent, width=width, depth=depth).pformat(object)
          File "/usr/lib/python2.7/pprint.py", line 122, in pformat
            self._format(object, sio, 0, 0, {}, 0)
          File "/usr/lib/python2.7/pprint.py", line 140, in _format
            rep = self._repr(object, context, level - 1)
          File "/usr/lib/python2.7/pprint.py", line 226, in _repr
            self._depth, level)
          File "/usr/lib/python2.7/pprint.py", line 238, in format
            return _safe_repr(object, context, maxlevels, level)
          File "/usr/lib/python2.7/pprint.py", line 280, in _safe_repr
            for k, v in _sorted(object.items()):
          File "/usr/lib/python2.7/pprint.py", line 78, in _sorted
            with warnings.catch_warnings():
        exceptions.RuntimeError: maximum recursion depth exceeded

我的 spider 是：

from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request

from loom.items import LoomItem
import sys


from scrapy.contrib.loader import XPathItemLoader

from scrapy.utils.response import get_base_url
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class LoomSpider(CrawlSpider):
    name = "loom_org"
    allowed_domains = ["2loom.com"]
    start_urls = [
        "http://2loom.com",
        "http://2loom.com/collections/basic",
        "http://2loom.com/collections/design",
        "http://2loom.com/collections/tum-koleksiyon"
    ]

    rules = [
           Rule(SgmlLinkExtractor(allow='products'), callback='parse_items',follow = True),
           Rule(SgmlLinkExtractor(allow=()), follow=True),
       ]    

    def parse_items(self, response):
        sys.setrecursionlimit(10000)        

        item = LoomItem()

        items = []
        sel = Selector(response)
        name = sel.xpath('//h1[@itemprop="name"]/text()').extract()
        brand = "2loom"
        price_lower = sel.xpath('//h1[@class="product-price"]/text()').extract()
        price = "0"
        image = sel.xpath('//meta[@property="og:image"]/@content').extract()
        description = sel.xpath('//meta[@property="og:description"]/@content').extract()

        print image

        ##image indiriliyor

        loader = XPathItemLoader(item, response = response)
        loader.add_xpath('image_urls', '//meta[@property="og:image"]/@content')     


        ##ID Split ediliyor (10. Design | Siyah & beyaz kalpli)

        id = name[0].strip().split(". ")
        id = id[0]

        item['id'] = id
        item['name'] = name
        item['url'] = response.url
        item['image'] = loader.load_item()
        item['category'] = "Basic"
        item['description'] = description
        item["brand"] = "2Loom"
        item['price'] = price
        item['price_lower'] = price_lower


        print item


        items.append(item)
        return items


Items

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field

class LoomItem(Item):
    # define the fields for your item here like:
    # name = Field()

    id = Field()
    name = Field()
    brand = Field()
    image = Field()
    category = Field()
    description = Field()
    price_lower = Field()
    price = Field()
    url = Field()
    images = Field()
    image_urls = Field()

Pipeline 是：

from scrapy.contrib.pipeline.images import ImagesPipeline, ImageException
from scrapy.http import Request
from cStringIO import StringIO
import psycopg2
import hashlib
from scrapy.conf import settings

class MyImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        return [Request(x) for x in item.get('image_urls', [])]   

    def item_completed(self, results, item, info):
        item['images'] = [x for ok, x in results if ok]
        return item

    # Override the convert_image method to disable image conversion    
    def convert_image(self, image, size=None):
        buf = StringIO()        
        try:
            image.save(buf, image.format)
        except Exception, ex:
            raise ImageException("Cannot process image. Error: %s" % ex)

        return image, buf    

    def image_key(self, url):
        image_guid = hashlib.sha1(url).hexdigest()
        return 'full/%s.jpg' % (image_guid)

Settings 是：

BOT_NAME = 'loom'

SPIDER_MODULES = ['loom.spiders']
NEWSPIDER_MODULE = 'loom.spiders'


DOWNLOAD_DELAY     = 5 

ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
IMAGES_STORE = '/root/loom/images/'

IMAGES_THUMBS = {
    'small': (90, 90),
    'big': (300, 300),
}

USER_AGENT     = "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0"
IM_MODULE      = 'loom.pipelines.MyImagePipeline'
ITEM_PIPELINES = ['loom.pipelines.MyImagePipeline']



LOG_LEVEL = 'INFO'

我不知道为什么会出现这个错误。所以感谢大家的帮助。

错误处理爬虫图片下载电商数据抓取

下载scrapy图片时出错

1 个回答

撰写回答