在Scrapy中提取图像
我看过这里的一些其他回答,但我觉得我缺少了一些基本的东西。我正在尝试用CrawlSpider从一个网站提取图片。
settings.py
BOT_NAME = 'healthycomm'
SPIDER_MODULES = ['healthycomm.spiders']
NEWSPIDER_MODULE = 'healthycomm.spiders'
ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
IMAGES_STORE = '~/Desktop/scrapy_nsml/healthycomm/images'
items.py
class HealthycommItem(scrapy.Item):
page_heading = scrapy.Field()
page_title = scrapy.Field()
page_link = scrapy.Field()
page_content = scrapy.Field()
page_content_block = scrapy.Field()
image_url = scrapy.Field()
image = scrapy.Field()
HealthycommSpider.py
class HealthycommSpiderSpider(CrawlSpider):
name = "healthycomm_spider"
allowed_domains = ["healthycommunity.org.au"]
start_urls = (
'http://www.healthycommunity.org.au/',
)
rules = (Rule(SgmlLinkExtractor(allow=()), callback="parse_items", follow=False), )
def parse_items(self, response):
content = Selector(response=response).xpath('//body')
for nodes in content:
img_urls = nodes.xpath('//img/@src').extract()
item = HealthycommItem()
item['page_heading'] = nodes.xpath("//title").extract()
item["page_title"] = nodes.xpath("//h1/text()").extract()
item["page_link"] = response.url
item["page_content"] = nodes.xpath('//div[@class="CategoryDescription"]').extract()
item['image_url'] = img_urls
item['image'] = ['http://www.healthycommunity.org.au' + img for img in img_urls]
yield item
我对Python不是很熟悉,但我觉得我在这里缺少了一些非常基础的知识。
谢谢,
Jamie
1 个回答
3
如果你想使用标准的 ImagesPipeline
,你需要把你的 parse_items
方法改成类似下面的样子:
import urlparse
...
def parse_items(self, response):
content = Selector(response=response).xpath('//body')
for nodes in content:
# build absolute URLs
img_urls = [urlparse.urljoin(response.url, src)
for src in nodes.xpath('//img/@src').extract()]
item = HealthycommItem()
item['page_heading'] = nodes.xpath("//title").extract()
item["page_title"] = nodes.xpath("//h1/text()").extract()
item["page_link"] = response.url
item["page_content"] = nodes.xpath('//div[@class="CategoryDescription"]').extract()
# use "image_urls" instead of "image_url"
item['image_urls'] = img_urls
yield item
而且你的项目定义里需要有 "images
" 和 "image_urls
" 这两个字段(注意是复数,不是单数)
另外一种方法是设置 IMAGES_URLS_FIELD
和 IMAGES_RESULT_FIELD
来适应你的项目定义