使用Scrapy抓取网页时，某些“不常见字符”的编码出错

def parse(self, response):
    """Read a movie name from the results table, then follow pagination.

    NOTE(review): this snippet looks like an elided excerpt. ``onerow``,
    ``movie_item`` and ``movie_budget_item`` are not defined anywhere in
    the visible code, and ``rows_in_big_table`` is never iterated.
    Presumably a ``for onerow in rows_in_big_table:`` loop and the item
    construction were dropped from the excerpt -- confirm against the
    full spider before running this.

    Args:
        response: The response handed to this callback; ``xpath`` and
            ``urljoin`` are called on it.

    Yields:
        ``movie_budget_item``, followed by a ``scrapy.Request`` for the
        next page when a link follows the active pagination entry.
    """
    base_link = 'http://www.the-numbers.com'
    rows_in_big_table = response.xpath("//table/tr")

    # NOTE(review): ``onerow`` / ``movie_item`` / ``movie_budget_item`` are
    # undefined in this excerpt (see docstring).
    movie_name = onerow.xpath('td/b/a/text()').extract()[0]
    movie_item['movie_name'] = movie_name
    yield movie_budget_item

    # The XPath axis is ``following-sibling`` with no whitespace inside the
    # name; the original ``following- sibling`` is not a valid expression.
    next_page = response.xpath(
        '//div[@class="pagination"]/a[@class="active"]'
        '/following-sibling::a/@href'
    ).get()
    if next_page is not None:
        # Resolve the (possibly relative) href against the current page URL
        # and re-enter this callback for the next page.
        next_page = response.urljoin(next_page)
        yield scrapy.Request(next_page, callback=self.parse)

1条回答

网友

1楼 · 发布于 2024-04-24 06:53:55

将导出编码（FEED_EXPORT_ENCODING）设置为ISO-8859-1：

import scrapy
from bad_encoding.items import BadEncodingItem


class MoviesSpider(scrapy.Spider):
    """Spider that turns each table row of the start page into an item.

    The feed export encoding is overridden for this spider through
    ``custom_settings`` (``FEED_EXPORT_ENCODING`` set to ISO-8859-1).
    """

    name = 'movies'
    allowed_domains = ['www.the-numbers.com']
    start_urls = [
        'https://www.the-numbers.com/box-office-records/domestic/all-movies/cumulative/all-time/301'
    ]

    # Per-spider settings override: export the feed as ISO-8859-1.
    custom_settings = {'FEED_EXPORT_ENCODING': 'ISO-8859-1'}

    def parse(self, response):
        """Yield one ``BadEncodingItem`` per ``//table/tbody/tr`` row.

        Each item field is filled from the first match of a row-relative
        XPath; ``.get()`` yields ``None`` when a cell has no match.
        """
        # (item field, XPath relative to the row), in assignment order.
        column_queries = (
            ('Rank', './/td[1]/text()'),
            ('Released', './/td[2]/a/text()'),
            ('Movie', './/td[3]/b/a/text()'),
            ('Domestic', './/td[4]/text()'),
            ('International', './/td[5]/text()'),
            ('Worldwide', './/td[6]/text()'),
        )

        for table_row in response.xpath('//table/tbody/tr'):
            record = BadEncodingItem()
            for field_name, query in column_queries:
                record[field_name] = table_row.xpath(query).get()

            yield record

这是我的json文件

相关问题更多 >

编程相关推荐

热门问题

热门文章