下载scrapy图片时出错

1 投票
1 回答
550 浏览
提问于 2025-04-18 06:32

我有一个 scrapy spider,用来从一些电商网站抓取图片和内容。现在我想下载图片,我写了一些代码,但出现了这个错误:

..

          File "/usr/lib/python2.7/pprint.py", line 238, in format
            return _safe_repr(object, context, maxlevels, level)
          File "/usr/lib/python2.7/pprint.py", line 282, in _safe_repr
            vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level)
          File "/usr/lib/python2.7/pprint.py", line 323, in _safe_repr
            rep = repr(object)
          File "/usr/local/lib/python2.7/dist-packages/Scrapy-0.23.0-py2.7.egg/scrapy/item.py", line 77, in __repr__
            return pformat(dict(self))
          File "/usr/lib/python2.7/pprint.py", line 63, in pformat
            return PrettyPrinter(indent=indent, width=width, depth=depth).pformat(object)
          File "/usr/lib/python2.7/pprint.py", line 122, in pformat
            self._format(object, sio, 0, 0, {}, 0)
          File "/usr/lib/python2.7/pprint.py", line 140, in _format
            rep = self._repr(object, context, level - 1)
          File "/usr/lib/python2.7/pprint.py", line 226, in _repr
            self._depth, level)
          File "/usr/lib/python2.7/pprint.py", line 238, in format
            return _safe_repr(object, context, maxlevels, level)
          File "/usr/lib/python2.7/pprint.py", line 282, in _safe_repr
            vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level)
          File "/usr/lib/python2.7/pprint.py", line 323, in _safe_repr
            rep = repr(object)
          File "/usr/local/lib/python2.7/dist-packages/Scrapy-0.23.0-py2.7.egg/scrapy/item.py", line 77, in __repr__
            return pformat(dict(self))
          File "/usr/lib/python2.7/pprint.py", line 63, in pformat
            return PrettyPrinter(indent=indent, width=width, depth=depth).pformat(object)
          File "/usr/lib/python2.7/pprint.py", line 122, in pformat
            self._format(object, sio, 0, 0, {}, 0)
          File "/usr/lib/python2.7/pprint.py", line 140, in _format
            rep = self._repr(object, context, level - 1)
          File "/usr/lib/python2.7/pprint.py", line 226, in _repr
            self._depth, level)
          File "/usr/lib/python2.7/pprint.py", line 238, in format
            return _safe_repr(object, context, maxlevels, level)
          File "/usr/lib/python2.7/pprint.py", line 280, in _safe_repr
            for k, v in _sorted(object.items()):
          File "/usr/lib/python2.7/pprint.py", line 78, in _sorted
            with warnings.catch_warnings():
        exceptions.RuntimeError: maximum recursion depth exceeded

我的 spider 是:

from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request

from loom.items import LoomItem
import sys


from scrapy.contrib.loader import XPathItemLoader

from scrapy.utils.response import get_base_url
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class LoomSpider(CrawlSpider):
    name = "loom_org"
    allowed_domains = ["2loom.com"]
    start_urls = [
        "http://2loom.com",
        "http://2loom.com/collections/basic",
        "http://2loom.com/collections/design",
        "http://2loom.com/collections/tum-koleksiyon"
    ]

    rules = [
           Rule(SgmlLinkExtractor(allow='products'), callback='parse_items',follow = True),
           Rule(SgmlLinkExtractor(allow=()), follow=True),
       ]    

    def parse_items(self, response):
        sys.setrecursionlimit(10000)        

        item = LoomItem()

        items = []
        sel = Selector(response)
        name = sel.xpath('//h1[@itemprop="name"]/text()').extract()
        brand = "2loom"
        price_lower = sel.xpath('//h1[@class="product-price"]/text()').extract()
        price = "0"
        image = sel.xpath('//meta[@property="og:image"]/@content').extract()
        description = sel.xpath('//meta[@property="og:description"]/@content').extract()

        print image

        ##image indiriliyor

        loader = XPathItemLoader(item, response = response)
        loader.add_xpath('image_urls', '//meta[@property="og:image"]/@content')     


        ##ID Split ediliyor (10. Design | Siyah & beyaz kalpli)

        id = name[0].strip().split(". ")
        id = id[0]

        item['id'] = id
        item['name'] = name
        item['url'] = response.url
        item['image'] = loader.load_item()
        item['category'] = "Basic"
        item['description'] = description
        item["brand"] = "2Loom"
        item['price'] = price
        item['price_lower'] = price_lower


        print item


        items.append(item)
        return items


Items

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field

class LoomItem(Item):
    # define the fields for your item here like:
    # name = Field()

    id = Field()
    name = Field()
    brand = Field()
    image = Field()
    category = Field()
    description = Field()
    price_lower = Field()
    price = Field()
    url = Field()
    images = Field()
    image_urls = Field()    

Pipeline 是:

from scrapy.contrib.pipeline.images import ImagesPipeline, ImageException
from scrapy.http import Request
from cStringIO import StringIO
import psycopg2
import hashlib
from scrapy.conf import settings

class MyImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        return [Request(x) for x in item.get('image_urls', [])]   

    def item_completed(self, results, item, info):
        item['images'] = [x for ok, x in results if ok]
        return item

    # Override the convert_image method to disable image conversion    
    def convert_image(self, image, size=None):
        buf = StringIO()        
        try:
            image.save(buf, image.format)
        except Exception, ex:
            raise ImageException("Cannot process image. Error: %s" % ex)

        return image, buf    

    def image_key(self, url):
        image_guid = hashlib.sha1(url).hexdigest()
        return 'full/%s.jpg' % (image_guid)  

Settings 是:

BOT_NAME = 'loom'

SPIDER_MODULES = ['loom.spiders']
NEWSPIDER_MODULE = 'loom.spiders'


DOWNLOAD_DELAY     = 5 

ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
IMAGES_STORE = '/root/loom/images/'

IMAGES_THUMBS = {
    'small': (90, 90),
    'big': (300, 300),
}

USER_AGENT     = "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0"
IM_MODULE      = 'loom.pipelines.MyImagePipeline'
ITEM_PIPELINES = ['loom.pipelines.MyImagePipeline']



LOG_LEVEL = 'INFO'  

我不知道为什么会出现这个错误。所以感谢大家的帮助。

1 个回答

1

试着在Spyder中把递归限制改成 sys.setrecursionlimit(10000)。我的Python解释器在遇到“运行时错误”之前只能进行900次递归。

撰写回答