如何从python中的scrapy输出中删除“\n”

import csv
import json
import unicodedata
from collections import OrderedDict

import html2text
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.selector.lxmlsel import HtmlXPathSelector
from scrapy.spiders import Spider

from scrapingtest.items import ScrapingTestingItem


def _to_ascii(text):
    """Return *text* NFKD-normalised and encoded as ASCII, dropping the rest."""
    # NOTE: relies on the Python 2 ``unicode`` builtin.
    return unicodedata.normalize('NFKD', unicode(text)).encode('ascii', 'ignore')


class scrapingtestspider(Spider):
    """Spider that collects review fields from a TripAdvisor hotel page.

    Each parsed page yields one ``ScrapingTestingItem`` whose fields are
    lists of extracted values, followed by one ``Request`` per "Next" link
    found on the page.
    """

    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = ["tripadvisor.in"]
    start_urls = [
        "http://www.tripadvisor.in/Hotel_Review-g297679-d736080-Reviews-Ooty_Elk_Hill_A_Sterling_Holidays_Resort-Ooty_Tamil_Nadu.html"]

    def parse(self, response):
        """Extract review data from *response* and follow pagination links.

        Yields:
            A ``ScrapingTestingItem`` with the keys ``reviews``, ``subjects``,
            ``stars``, ``names``, ``location`` and ``date``, then a
            ``Request`` (handled by this same method) for every "Next" href.
        """
        item = ScrapingTestingItem()
        sel = HtmlXPathSelector(response)
        sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()

        item['reviews'] = sel.xpath('//div[@class="col2of2"]//p[@class="partial_entry"]/text()').extract()
        item['subjects'] = sel.xpath('//span[@class="noQuotes"]/text()').extract()
        item['stars'] = sel.xpath('//*[@class="rating reviewItemInline"]//img/@alt').extract()
        item['names'] = sel.xpath('//*[@class="username mo"]/span/text()').extract()
        item['location'] = sel.xpath('//*[@class="location"]/text()').extract()

        # Dates come from two places: a ``title`` attribute (used verbatim)
        # and plain text, from which the first 9 characters are dropped
        # (presumably a fixed "Reviewed "-style prefix -- confirm on the page).
        relative_dates = sel.xpath('//*[@class="ratingDate relativeDate"]/@title').extract()
        absolute_dates = sel.xpath('//div[@class="col2of2"]//span[@class="ratingDate"]/text()').extract()
        item['date'] = relative_dates + [text[9:].strip() for text in absolute_dates]

        # Keep only the first character of each rating's alt text
        # (presumably the leading digit of the star count -- confirm).
        item['stars'] = [alt[:1].strip() for alt in item['stars']]

        for review in item['reviews']:
            print(_to_ascii(review))

        item['subjects'] = [_to_ascii(subject) for subject in item['subjects']]

        yield item

        # An empty ``sites`` list simply produces no follow-up requests.
        for site in sites:
            yield Request(url="http://tripadvisor.in" + site, callback=self.parse)

2条回答

网友

1楼 · 编辑于 2024-05-16 08:33:23

我通常使用 Item Loaders 配合输入和/或输出处理器（Input and/or Output Processors）来修剪和清理输出，这样代码会更加模块化、更整洁：

class ScrapingTestingLoader(ItemLoader):
    """Item loader that strips every extracted value and keeps the first one."""

    # Output side: TakeFirst() is applied when a field is emitted.
    default_output_processor = TakeFirst()
    # Input side: unicode.strip is mapped over each extracted value.
    default_input_processor = MapCompose(unicode.strip)

然后，如果使用这个 Item Loader 来加载 item，提取到的值会被去除首尾空白，并以字符串（而不是列表）的形式返回。例如，如果提取到的字段是 ["my value \n"]，输出将是 my value。

网友

2楼 · 编辑于 2024-05-16 08:33:23

阅读列表文档后的简单解决方案。

# Drop every "\n" entry in a single pass; slice assignment keeps it in place.
some_list[:] = [entry for entry in some_list if entry != "\n"]

相关问题更多 >

编程相关推荐

热门问题

热门文章