Scraping the Dawn news site returns (referer: None)

Posted 2024-06-06 15:58:43


My scrapy code returns (referer: None) for a news website. The code is below. I tried the same code with the BBC site and it worked fine, but for this site it does not return the desired results.

import os
import scrapy


newpath = 'urdu_data' 
if not os.path.exists(newpath):
    os.makedirs(newpath)


class UrduSpider(scrapy.Spider):
    name = "urdu"
    start_urls = [
        'https://www.dawnnews.tv',
        'https://www.dawnnews.tv/latest-news'
        'https://www.dawnnews.tv/news'
        'https://www.dawnnews.tv/tech'
    ]

    def should_process_page(self, page_url):
        for s_url in self.start_urls:
            if page_url.startswith(s_url) and page_url != s_url:
                return True

        return False

    def parse(self, response):

        if self.should_process_page(response.url):
            page_id = response.url.split("/")[-1]
            filename = page_id + '.txt'

            # if the response has a story body, save its contents
            story_body = response.css('div.story__content')
            story_paragraphs_text = story_body.css('p::text')
            page_data = ''
            for p in story_paragraphs_text:
                page_data += p.extract() + '\n'

            if page_data:
                open('urdu_data/' + filename, 'w').write(page_data)

            # Now follow any links that are present on the page
            links = response.css('a.title-link ::attr(href)').extract()
            for link in links:
                yield scrapy.Request(
                    response.urljoin(link),
                    callback=self.parse
                )

1 Answer

#1 · Posted 2024-06-06 15:58:43

I think you need start URLs like the following:

start_urls = [
        'https://www.dawnnews.tv',
        'https://www.dawnnews.tv/latest-news',
        'https://www.dawnnews.tv/news',
        'https://www.dawnnews.tv/tech'
    ]

In the code you posted, the URLs are not separated by commas, so start_urls effectively contains only two entries: the first URL, and the other three URLs joined together and used as a single URL. Please add a comma after each URL.
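This is easy to reproduce on its own: adjacent string literals in Python are implicitly concatenated, so the list below (the same shape as your start_urls) silently ends up with two entries instead of four.

```python
# Missing commas: Python implicitly concatenates adjacent string literals,
# so the last three URLs collapse into one long string.
start_urls = [
    'https://www.dawnnews.tv',
    'https://www.dawnnews.tv/latest-news'   # no comma here
    'https://www.dawnnews.tv/news'          # no comma here
    'https://www.dawnnews.tv/tech'
]

print(len(start_urls))   # 2, not 4
print(start_urls[1])
# https://www.dawnnews.tv/latest-newshttps://www.dawnnews.tv/newshttps://www.dawnnews.tv/tech
```

Scrapy never sees the three merged pages as separate requests, which is one reason the spider appears to do nothing for them.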

Next, story_body = response.css('div.story__content') assumes the page contains a div element with class=story__content, which I think is missing from the pages at the URLs above. From a quick look at the HTML of https://www.dawnnews.tv, it seems to use something like story__excerpt as the div class; I'm not sure whether that is what you need. In any case, you need to inspect the HTML of these pages to find the right selectors.

To debug this, you can add print statements to print out story_body and story_paragraphs_text and check whether you are actually getting them. Hopefully that will help you with the debugging you need to do.


The updated code:

import os
import scrapy


newpath = 'urdu_data' 
if not os.path.exists(newpath):
    os.makedirs(newpath)


class UrduSpider(scrapy.Spider):
    name = "urdu"
    start_urls = [
        'https://www.dawnnews.tv',
        'https://www.dawnnews.tv/latest-news',
        'https://www.dawnnews.tv/news',
        'https://www.dawnnews.tv/tech'
    ]

    def should_process_page(self, page_url):
        for s_url in self.start_urls:
            if page_url.startswith(s_url) and page_url != s_url:
                return True

        return False

    def parse(self, response):
        print(response.url)
        if self.should_process_page(response.url):
            page_id = response.url.split("/")[-1]
            filename = page_id + '.txt'
            print(filename)

            # if the response has a story body, save its contents
            story_body = response.css('div.story__excerpt')
            print(story_body)
            story_paragraphs_text = story_body.css('p::text')
            page_data = ''
            for p in story_paragraphs_text:
                page_data += p.extract() + '\n'

            if page_data:
                with open('urdu_data/' + filename, 'w') as f:
                    f.write(page_data)

            # Now follow any links that are present on the page
            links = response.css('a.title-link ::attr(href)').extract()
            for link in links:
                yield scrapy.Request(
                    response.urljoin(link),
                    callback=self.parse
                )

You will need to make similar changes to extract content from other elements, depending on each page's HTML structure.
