Scrapy only stores the last scraped item — how to merge the scraped data into one string

Posted 2024-04-19 22:49:21


I am scraping some news sites with the Scrapy framework, but it seems to store only the last scraped item, repeating it through the loop.

I want to store the title, date, and link scraped from the first page, and also store the full news article. I would then like to merge the article, which is stored as a list, into a single string.

Item code

import scrapy

class ScrapedItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    source = scrapy.Field()
    date = scrapy.Field()
    paragraph = scrapy.Field()

Spider code

import scrapy
from ..items import ScrapedItem


class CBNCSpider(scrapy.Spider):
    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        box_text = response.xpath("//ul/li/div[@class='ket']")
        items = ScrapedItem()

        for crawl in box_text:
            title = crawl.css("h1 a::text").extract()
            source = "https://investasi.kontan.co.id" + crawl.css("h1 a::attr(href)").extract()[0]
            date = crawl.css("span.font-gray::text").extract()[0].replace("|", "")

            items['title'] = title
            items['source'] = source
            items['date'] = date

            yield scrapy.Request(url=source,
                                 callback=self.parseparagraph,
                                 meta={'item': items})

    def parseparagraph(self, response):
        items_old = response.meta['item']  # only the last item is stored
        paragraph = response.xpath("//p/text()").extract()
        items_old['paragraph'] = paragraph  # want to merge this into a single string
        yield items_old

I expect the date, title, and source in the output to be updated through the loop, and the article to be merged into a single string so it can be stored in MySQL.
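The root cause can be demonstrated without Scrapy at all: the spider creates one `ScrapedItem` and mutates it on every loop iteration, but the request callbacks only run after the loop has finished, so they all see the last values written. A minimal sketch (plain Python dicts and deferred lambdas stand in for the item and the callbacks; the names are illustrative, not from the question):

```python
# Buggy pattern: one shared dict, mutated each iteration, read later by
# deferred callbacks (like the single ScrapedItem instance in the spider).
shared = {}
buggy_callbacks = []
for title in ["news A", "news B", "news C"]:
    shared["title"] = title
    # The callback runs later, after the loop has finished mutating `shared`.
    buggy_callbacks.append(lambda item=shared: item["title"])

print([cb() for cb in buggy_callbacks])  # every callback sees "news C"

# Fixed pattern: create a fresh dict per iteration, as the answer below does.
fixed_callbacks = []
for title in ["news A", "news B", "news C"]:
    fresh = {"title": title}
    fixed_callbacks.append(lambda item=fresh: item["title"])

print([cb() for cb in fixed_callbacks])  # each callback keeps its own title
```

The same reasoning applies to the spider: moving the item construction inside the loop gives every request its own object to carry through `meta`.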


1 Answer

Answered 2024-04-19 22:49:21

I defined an empty dictionary inside the loop and put those variables in it. I also made some minor changes to the xpath and css selectors to make them less error-prone. The script works as expected:

import scrapy

class CBNCSpider(scrapy.Spider):
    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        for crawl in response.xpath("//*[@id='list-news']//*[@class='ket']"):
            d = {}
            d['title'] = crawl.css("h1 > a::text").get()
            d['source'] = response.urljoin(crawl.css("h1 > a::attr(href)").get())
            d['date'] = crawl.css("span.font-gray::text").get().strip("|")
            yield scrapy.Request(
                url=d['source'],
                callback=self.parseparagraph,
                meta={'item':d}
             )

    def parseparagraph(self, response):
        items_old = response.meta['item']
        items_old['paragraph'] = response.xpath("//p/text()").getall()
        yield items_old
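One detail the answer leaves open: `getall()` returns a list of `<p>` text nodes, while the asker wants a single string for MySQL. A small sketch of joining the list before yielding (the whitespace normalization is my assumption, not part of the answer):

```python
# Collapse the list of <p> text nodes into one string before storing.
# Stripping each piece and skipping empties is an assumption; adjust to taste.
paragraphs = ["First paragraph.", "  ", "Second paragraph.", "Third."]
article = " ".join(p.strip() for p in paragraphs if p.strip())
print(article)
```

In the spider this would replace the `getall()` assignment, e.g. `items_old['paragraph'] = " ".join(...)`, so the item arrives at the pipeline already flattened.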
