MongoDB InvalidDocument:无法对obj进行编码

# -*- coding: utf-8 -*- # Define your item pipelines here # import pymongo import sys, traceback from scrapy.exceptions import DropItem from crawler.items import BlogItem, CommentItem class MongoPipeline(object): collection_name = 'master' def __init__(self, mongo_uri, mongo_db): self.mongo_uri = mongo_uri self.mongo_db = mongo_db @classmethod def from_crawler(cls, crawler): return cls( mongo_uri=crawler.settings.get('MONGO_URI'), mongo_db=crawler.settings.get('MONGO_DATABASE', 'posts') ) def open_spider(self, spider): self.client = pymongo.MongoClient(self.mongo_uri) self.db = self.client[self.mongo_db] def close_spider(self, spider): self.client.close() def process_item(self, item, spider): if type(item) is BlogItem: try: if 'url' in item: item['url'] = item['url'].encode('utf-8', 'strict') if 'domain' in item: item['domain'] = item['domain'].encode('utf-8', 'strict') if 'title' in item: item['title'] = item['title'].encode('utf-8', 'strict') if 'date' in item: item['date'] = item['date'].encode('utf-8', 'strict') if 'content' in item: item['content'] = item['content'].encode('utf-8', 'strict') if 'author' in item: item['author'] = item['author'].encode('utf-8', 'strict') except: # catch *all* exceptions e = sys.exc_info()[0] spider.logger.critical("ERROR ENCODING %s", e) traceback.print_exc(file=sys.stdout) raise DropItem("Error encoding BLOG %s" % item['url']) if 'comments' in item: comments = item['comments'] item['comments'] = [] try: for comment in comments: if 'date' in comment: comment['date'] = comment['date'].encode('utf-8', 'strict') if 'author' in comment: comment['author'] = comment['author'].encode('utf-8', 'strict') if 'content' in comment: comment['content'] = comment['content'].encode('utf-8', 'strict') item['comments'].append(comment) except: # catch *all* exceptions e = sys.exc_info()[0] spider.logger.critical("ERROR ENCODING COMMENT %s", e) traceback.print_exc(file=sys.stdout) self.db[self.collection_name].insert(dict(item)) return item

au coeur de l\u2019explosion de la bulle Internet n\u2019est probablement pas \xe9tranger au succ\xe8s qui a suivi. Mais franchement, c\u2019est un peu court comme argument !Ce que je sais dire, compte tenu de ce qui pr\xe9c\xe8de, c\u2019est quelles sont les conditions pour r\xe9ussir si l\u2019on est vraiment contraint de rester en France. Ce sont des sujets que je d\xe9velopperai dans un autre article.', 'date': u'2012-06-27T23:21:25+00:00', 'domain': 'reussir-sa-boite.fr', 'title': u'Peut-on encore entreprendre en France ?\t\t\t ', 'url': 'http://www.reussir-sa-boite.fr/peut-on-encore-entreprendre-en-france/'} Traceback (most recent call last): File "h:\program files\anaconda\lib\site-packages\twisted\internet\defer.py", line 588, in _runCallbacks current.result = callback(current.result, *args, **kw) File "H:\PDS\BNP\crawler\crawler\pipelines.py", line 76, in process_item self.db[self.collection_name].insert(dict(item)) File "h:\program files\anaconda\lib\site-packages\pymongo\collection.py", line 409, in insert gen(), check_keys, self.uuid_subtype, client) InvalidDocument: Cannot encode object: {'author': 'Arnaud Lemasson', 'content': 'Tellement vrai\xe2\x80\xa6 Il faut vraiment \xc3\xaatre motiv\xc3\xa9 aujourd\xe2\x80\x99hui pour monter sa bo\xc3\xaete. On est pr\xc3\xa9lev\xc3\xa9 de partout, je ne pense m\xc3\xaame pas \xc3\xa0 embaucher, cela me co\xc3\xbbterait bien trop cher. Bref, 100% d\xe2\x80\x99accord avec vous. Le probl\xc3\xa8me, je ne vois pas comment cela pourrait changer avec le gouvernement actuel\xe2\x80\xa6 A moins que si, j\xe2\x80\x99ai pu lire il me semble qu\xe2\x80\x99ils avaient en t\xc3\xaate de r\xc3\xa9duire l\xe2\x80\x99IS pour les petites entreprises et de l\xe2\x80\x99augmenter pour les grandes\xe2\x80\xa6 A voir', 'date': '2012-06-27T23:21:25+00:00'} 2015-11-04 15:29:15 [scrapy] INFO: Closing spider (finished) 2015-11-04 15:29:15 [scrapy] INFO: Dumping Scrapy stats: {'downloader/request_bytes': 259, 'downloader/request_count': 1, 'downloader/request_method_count/GET': 1, 'downloader/response_bytes': 252396, 'downloader/response_count': 1, 'downloader/response_status_count/200': 1, 'finish_reason': 'finished', 'finish_time': datetime.datetime(2015, 11, 4, 14, 29, 15, 701000), 'log_count/DEBUG': 2, 'log_count/ERROR': 1, 'log_count/INFO': 7, 'response_received_count': 1, 'scheduler/dequeued': 1, 'scheduler/dequeued/memory': 1, 'scheduler/enqueued': 1, 'scheduler/enqueued/memory': 1, 'start) time': datetime.datetime(2015, 11, 4, 14, 29, 13, 191000)}

>>> s = "Tellement vrai\xe2\x80\xa6 Il faut vraiment \xc3\xaatre motiv\xc3\xa9 aujourd\xe2\x80\x99hui pour monter sa bo\xc3\xaete. On est pr\xc3\xa9lev\xc3\xa9 de partout, je ne pense m\xc3\xaame pas \xc3\xa0 embaucher, cela me" >>> s 'Tellement vrai\xe2\x80\xa6 Il faut vraiment \xc3\xaatre motiv\xc3\xa9 aujourd\xe2\x80\x99hui pour monter sa bo\xc3\xaete. On est pr\xc3\xa9lev\xc3\xa9 de partout, je ne pense m\xc3\xaame pas \xc3\xa0 embaucher, cela me' >>> se = s.encode("utf8", "strict") Traceback (most recent call last): File "<stdin>", line 1, in <module> UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 14: ordinal not in range(128) >>> se = s.encode("utf-8", "strict") Traceback (most recent call last): File "<stdin>", line 1, in <module> UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 14: ordinal not in range(128) >>> s.decode() Traceback (most recent call last): File "<stdin>", line 1, in <module> UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 14: ordinal not in range(128)

3条回答

网友
1楼 · 编辑于 2024-05-16 01:15:37

我终于明白了。问题不在于编码。这是文件的结构。
因为我开始使用标准的MongoPipeline示例，它不处理嵌套的垃圾项。
我要做的是：博客项目： “网址” ... 注释=[注释项]
所以我的BlogItem有一个CommentItems列表。现在问题来了，在数据库中持久化对象我做到了：
self.db[self.collection_name].insert(dict(item))
所以在这里，我将BlogItem解析为dict，但不解析CommentItems列表。因为回溯显示的CommentItem有点像dict，所以我没有想到有问题的对象不是dict！
因此，最后解决此问题的方法是在将注释附加到注释列表时更改行，如下所示：
item['comments'].append(dict(comment))
现在MongoDB认为它是一个有效的文档。
最后，在最后一部分，我问为什么在python控制台而不是脚本中会出现异常。
原因是我正在使用python控制台，它只支持ascii。也就是错误。

网友
2楼 · 编辑于 2024-05-16 01:15:37

首先，当您执行"somestring".encode(...)时，不会更改"somestring"，但它会返回一个新的编码字符串，因此您应该使用如下内容：
item['author'] = item['author'].encode('utf-8', 'strict')
其他领域也一样。

网友
3楼 · 编辑于 2024-05-16 01:15:37

运行查询时出现此错误

db.collection.find({'attr': {'$gte': 20}})

而collection中的某些记录具有attr的非数值值。

相关问题更多 >

编程相关推荐

热门问题

热门文章