How do I use Scrapy's Item Pipeline to store scraped items in a database?
I want to use an item pipeline to store the scraped content in a database.
Here is my spider code:
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from CollecteurImmobilier.items import CollecteurimmobilierItem

class AnnonceSpider(CrawlSpider):
    name = "Annonce"
    allowed_domains = ["tayara.tn"]
    start_urls = ["http://www.tayara.tn/sousse/immobilier-%C3%A0_vendre"]
    # follow pagination links such as ?o=2, ?o=3, ...
    rules = (Rule(SgmlLinkExtractor(allow=(r'\?o=\d',)), 'parse_start_url', follow=True),)

    def parse_start_url(self, response):
        sel = Selector(response)
        DivAnnonces = sel.xpath('//div[@class="item"]')
        items = []
        for DivAnnonce in DivAnnonces:
            item = CollecteurimmobilierItem()
            # .extract() returns a list of matches, so item['link'] is a list
            item['link'] = DivAnnonce.xpath('.//h2/a/@href').extract()
            items.append(item)
        return items
Here is my pipeline code:
from scrapy import log
from twisted.enterprise import adbapi
import MySQLdb  # the DB-API driver that adbapi loads by name below

class MySQLStorePipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    def process_item(self, item, spider):
        # run the db query in the thread pool
        query = self.dbpool.runInteraction(self._conditional_insert, item, spider)
        query.addErrback(self._handle_error, item, spider)
        # always hand the item back, whether the query succeeded or failed
        query.addBoth(lambda _: item)
        # return the deferred instead of the item, so the engine only moves on
        # to the next item (per the CONCURRENT_ITEMS setting) once this
        # operation has finished
        return query

    def _conditional_insert(self, tx, item, spider):
        tx.execute("""
            SELECT * FROM AnnonceGratuit WHERE link = %s
        """, (item['link']))
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            tx.execute("""
                INSERT INTO AnnonceGratuit (link)
                VALUES (%s)
            """, (item['link']))
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def _handle_error(self, failure, item, spider):
        """Handle errors that occur during the db interaction."""
        # do nothing, just log
        log.err(failure)
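For context, dbpool.runInteraction(f, ...) runs f in a thread from the pool, passing it a cursor-like transaction object (the tx argument above), and returns a Deferred onto which the errback and addBoth callbacks are chained. A minimal standalone sketch of the same pattern outside Scrapy might look like this (the host, database name, and credentials are placeholders):

from twisted.enterprise import adbapi
from twisted.internet import reactor

# placeholder connection details for a hypothetical local test database
dbpool = adbapi.ConnectionPool('MySQLdb', host='localhost', db='testdb',
                               user='root', passwd='secret', charset='utf8')

def insert_link(tx, link):
    # tx wraps a DB-API cursor and runs inside a worker thread
    tx.execute("INSERT INTO AnnonceGratuit (link) VALUES (%s)", (link,))

d = dbpool.runInteraction(insert_link, 'http://example.com/annonce/1')
d.addCallback(lambda _: reactor.stop())
d.addErrback(lambda failure: (failure.printTraceback(), reactor.stop()))
reactor.run()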
Here is my mysql.sql file:
DROP TABLE IF EXISTS AnnonceGratuit;
CREATE TABLE AnnonceGratuit (
    link VARCHAR(255),
    title VARCHAR(255)
) DEFAULT CHARSET=utf8;
In my settings, I added this:
ITEM_PIPELINES = {
'CollecteurImmobilier.pipelines.MySQLStorePipeline': 300,
}
But when I run my spider like this:
scrapy crawl Annonce -o items.xml -t xml
the terminal shows no errors, and while the spider runs I see the message "Item already stored in db".
The items.xml file is produced correctly, but nothing is stored in my database.
Can anyone help me figure out what is going wrong?
1 Answer
Try using a database index to detect duplicates:
def _conditional_insert(self, tx, item, spider):
    try:
        tx.execute("""
            INSERT INTO AnnonceGratuit (link)
            VALUES (%s)
        """, (item['link']))
        log.msg("Item stored in db: %s" % item, level=log.DEBUG)
    except MySQLdb.IntegrityError:
        # the unique index rejected a duplicate link
        log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
If you add a unique index constraint on the link column in the database, this should work correctly.
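For reference, the unique constraint could be added with a statement like the following (a sketch assuming the VARCHAR(255) link column from the schema above; uniq_link is just an illustrative index name):

ALTER TABLE AnnonceGratuit ADD UNIQUE KEY uniq_link (link);

With the index in place, a duplicate INSERT raises MySQLdb.IntegrityError, which is why the handler above catches that specific exception instead of using a bare except: real failures such as a missing table or bad credentials still reach the error log instead of being misreported as duplicates.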