Scrapy's pipeline.py not inserting scraped items into MySQL
I'm using Scrapy to scrape news headlines, and I'm new to both Scrapy and scraping in general. For the past few days I've been struggling to get the scraped data into my SQL database.
My pipelines.py file contains two classes: one inserts the data into the database, and the other backs the scraped data up to a JSON file for front-end web development.
Here is my spider code:
- It extracts the news headlines from start_urls.
- It uses extract() to pull the data out as strings, then loops over those strings, calling strip() to remove extra whitespace for cleaner formatting.
from scrapy.spider import Spider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.item import Item
from Aljazeera.items import AljazeeraItem
from datetime import date, datetime

class AljazeeraSpider(Spider):
    name = "aljazeera"
    allowed_domains = ["aljazeera.com"]
    start_urls = [
        "http://www.aljazeera.com/news/europe/",
        "http://www.aljazeera.com/news/middleeast/",
        "http://www.aljazeera.com/news/asia/",
        "http://www.aljazeera.com/news/asia-pacific/",
        "http://www.aljazeera.com/news/americas/",
        "http://www.aljazeera.com/news/africa/",
        "http://blogs.aljazeera.com/"
    ]

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//td[@valign="bottom"]')
        contents = sel.xpath('//div[@class="indexSummaryText"]')
        items = []
        for site, content in zip(sites, contents):
            item = AljazeeraItem()
            item['headline'] = site.xpath('div[3]/text()').extract()
            item['content'] = site.xpath('div/a/text()').extract()
            item['date'] = str(date.today())
            for headline, content in zip(item['content'], item['headline']):
                item['headline'] = headline.strip()
                item['content'] = content.strip()
            items.append(item)
        return items
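(The items.py file isn't shown above; a minimal AljazeeraItem covering the three fields the spider fills would look something like this. This is a sketch, not the actual file from the project:)
# Aljazeera/items.py -- sketch only; the real file may define more,
# but the spider above needs at least these three fields declared
from scrapy.item import Item, Field

class AljazeeraItem(Item):
    headline = Field()  # headline text scraped from the page
    content = Field()   # summary/teaser text
    date = Field()      # scrape date as an ISO string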
My pipeline.py code is as follows:
import sys
import MySQLdb
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request
import json
import os.path

class SQLStore(object):
    def __init__(self):
        self.conn = MySQLdb.connect(user='root', passwd='', db='aj_db', host='localhost', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

# insert the scraped data into the database
def process_item(self, item, spider):
    try:
        self.cursor.execute("""INSERT INTO scraped_data(headlines, contents, dates) VALUES (%s, %s, %s)""", (item['headline'].encode('utf-8'), item['content'].encode('utf-8'), item['date'].encode('utf-8')))
        self.conn.commit()
    except MySQLdb.Error, e:
        print "Error %d: %s" % (e.args[0], e.args[1])
    return item

# back up each run's scraped data to a JSON file
class JsonWriterPipeline(object):
    def __init__(self):
        self.file = open('backDataOfScrapes.json', "w")

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write("item === " + line)
        return item
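(The INSERT above assumes a scraped_data table with headlines, contents and dates columns already exists in aj_db. The question doesn't show the schema; a one-off setup script matching the insert, with column types that are my assumption, could look like this:)
# setup sketch: creates a table matching the INSERT in the pipeline;
# the column types here are assumptions, not taken from the question
import MySQLdb

conn = MySQLdb.connect(user='root', passwd='', db='aj_db', host='localhost', charset="utf8", use_unicode=True)
cursor = conn.cursor()
cursor.execute("""CREATE TABLE IF NOT EXISTS scraped_data (
    id INT AUTO_INCREMENT PRIMARY KEY,
    headlines VARCHAR(512),
    contents TEXT,
    dates DATE
)""")
conn.commit()
conn.close()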
And my settings.py looks like this:
BOT_NAME = 'Aljazeera'

SPIDER_MODULES = ['Aljazeera.spiders']
NEWSPIDER_MODULE = 'Aljazeera.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Aljazeera (+http://www.yourdomain.com)'

ITEM_PIPELINES = {
    'Aljazeera.pipelines.JsonWriterPipeline': 300,
    'Aljazeera.pipelines.SQLStore': 300,
}
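(One thing worth noting about ITEM_PIPELINES: the integer values only set the order the pipelines run in, with items flowing from lower to higher values, so giving both entries 300 leaves their relative order up to Scrapy. Distinct values make the order explicit, for example:)
ITEM_PIPELINES = {
    'Aljazeera.pipelines.SQLStore': 300,           # items hit the database first
    'Aljazeera.pipelines.JsonWriterPipeline': 400,  # then the JSON backup
}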
My SQL setup is all fine. When I run scrapy crawl aljazeera it works, and it even outputs the data in JSON format, like this:
item === {"headline": "Turkey court says Twitter ban violates rights", "content": "Although ruling by Turkey's highest court is binding, it is unclear whether the government will overturn the ban.", "date": "2014-04-02"}
I really can't figure out what I'm missing. Any help would be much appreciated.
Thanks for your time,
1 Answer
The indentation in your SQLStore pipeline is wrong: process_item needs to be a method of the class. I tested it with the correct indentation and it works fine. Copy the version below and you should be all set.
class SQLStore(object):
    def __init__(self):
        self.conn = MySQLdb.connect(user='root', passwd='', db='aj_db', host='localhost', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    # insert the scraped data into MySQL; Scrapy only calls this
    # if process_item is defined on the pipeline class itself
    def process_item(self, item, spider):
        try:
            self.cursor.execute("""INSERT INTO scraped_data(headlines, contents, dates) VALUES (%s, %s, %s)""", (item['headline'].encode('utf-8'), item['content'].encode('utf-8'), item['date'].encode('utf-8')))
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        return item
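To double-check that rows are actually landing in the table after a crawl, here's a quick sanity check reusing the same connection settings as the pipeline:
# sanity check: count the rows the pipeline inserted
import MySQLdb

conn = MySQLdb.connect(user='root', passwd='', db='aj_db', host='localhost', charset="utf8", use_unicode=True)
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM scraped_data")
print(cursor.fetchone()[0])
conn.close()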