How do I use Scrapy to get to the next chapter on fanfiction.net?

Posted 2024-06-02 08:42:08


On fanfiction.net, this is the HTML for a story's chapter-navigation selector:

<select id="chap_select" title="Chapter Navigation" name="chapter" onchange="self.location = '/s/13109220/'+ this.options[this.selectedIndex].value + '/Son-of-the-Hunt';">
  <option value="1" selected="">1. Chapter 1</option>
  <option value="2">2. Camp</option>
  <option value="3">3. Chapter 3</option>
</select>
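The onchange handler builds each chapter's URL as /s/<story id>/<chapter value>/<slug>. As a minimal sketch of how the next chapter's URL could be derived from this selector inside a Scrapy callback (the helper name next_chapter_url is hypothetical, and the URL layout is assumed to match the handler above):

def next_chapter_url(response):
    # Value of the currently selected <option>, e.g. '1'.
    current = response.xpath(
        '(//select[@id="chap_select"])[1]/option[@selected]/@value').get()
    # All chapter values in order, e.g. ['1', '2', '3'].
    values = response.xpath(
        '(//select[@id="chap_select"])[1]/option/@value').getall()
    if current is None or current == values[-1]:
        return None  # no selector found, or already on the last chapter
    next_value = values[values.index(current) + 1]
    # Assumes the URL has the shape https://.../s/<story id>/<chapter>/<slug>,
    # matching the onchange handler above; swap in the next chapter value.
    parts = response.url.split('/')
    parts[5] = next_value
    return '/'.join(parts)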

I want to use it to go on and download the next chapter's text, but the usual approach, recursively calling self.fanfiction(), doesn't work here because of the self.storyNum += 1 line.

import scrapy, docx, time
import subprocess as sp

class FanfictionDownloader(scrapy.Spider):
    name = "fanfiction"
    storyNum = 0
    nextPage = False
    urls = []
    docText = ''
    title = ''

    def start_requests(self):
        sp.call('cls', shell=True)
        self.urls = list(str(input("Enter url separated by a comma and space (, ): ")).split(', '))
        for url in self.urls:
            if self.urls[self.storyNum].startswith('https://www.fanfiction.net/s/'):
                yield scrapy.Request(url=url, callback=self.fanfiction)
            elif self.urls[self.storyNum].startswith('https://www.wattpad.com/'):
                yield scrapy.Request(url=url, callback=self.wattpad)
            else:
                print('Not a valid link, ending downloader.')
                time.sleep(5)
                quit()
                sp.call('cls', shell=True)

    def fanfiction(self, response):
        self.storyNum += 1
        doc = docx.Document()
        chapters = ''
        totalChapters = 0
        currentChapter = 1
        i = 0
        for para in response.css('div#storytext > p'):
            text = (para.xpath('text() | */text() | */*/text()').getall())
            self.title = (response.xpath('//*[@id="profile_top"]/b/text()').get())
            storyId = ((response.xpath('//*[@id="profile_top"]/span[4]/text()[5]').get()).replace(' - id: ', ''))
            chapters = (response.xpath('//*[@id="chap_select"]/option/text()').getall())
            totalChapters = len(chapters[0:int(len(chapters) / 2)])
            finalText = [
                [x.replace('\u00ef', 'ï').replace('\u2013', '–').replace('\u2026', '...') for x in text],
                ['Story %s: %s' % (self.storyNum,  self.urls[self.storyNum - 1])],
                ['Title: %s' % (self.title)],
                ['Story ID: %s' % (storyId)],
                ['Total Chapters: %s' % (totalChapters)],
                ['Chapter Names: %s' % [chapters[0:int(len(chapters) / 2)]]],
            ]
            if len(finalText[0][0]) > 1:
                self.docText = (''.join(finalText[0][0:]))
            else:
                self.docText = finalText[0][0]
            if self.nextPage == False:
                doc.add_paragraph(self.docText)
            else:
                doc.add_page_break(self.docText)
                self.nextPage = False
                doc.add_paragraph()
            sp.call('cls', shell=True)
            doc.save('./../%s.docx' % (self.title))
            i += 1
            yield {'line ' + str(i): finalText}
            sp.call('cls', shell=True)

    def wattpad(self, response):
        pass

1 Answer

Reply #1 · Anonymous user · Posted 2024-06-02 08:42:08

Do you really need to keep a counter for the story?

I think you can just look for the next-page button, for example:

if response.xpath('//button[text()="Next >"]'):
    next_link = response.xpath('//button[text()="Next >"]')[0].attrib['onclick'].replace('self.location=', '').replace("'", '')
    yield response.follow('https://www.fanfiction.net' + next_link, self.fanfiction)
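To make the string handling concrete: on the chapter pages the button's onclick value looks roughly like self.location='/s/13109220/2/Son-of-the-Hunt' (modeled on the selector markup in the question, not copied from a live page), so the two replace() calls strip the JavaScript wrapper and leave a relative path:

# Hypothetical onclick value, modeled on the chapter selector above.
onclick = "self.location='/s/13109220/2/Son-of-the-Hunt'"
next_link = onclick.replace('self.location=', '').replace("'", '')
print(next_link)  # -> /s/13109220/2/Son-of-the-Hunt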

As mentioned in the comments, you should use an item pipeline to take care of "storing" your items in the document.

Here is something to give you an idea. It works for me, but you will have to adapt it to your use case:

import docx
import scrapy

class StoryPipeline:

    def open_spider(self, spider):
        # One document per crawl; filled as items arrive, saved on close.
        self.doc = docx.Document()

    def process_item(self, item, spider):
        # The metadata item carries the title; everything else is chapter text.
        if 'title' in item:
            self.title = item['title']
            self.doc.add_paragraph(str(item))
        else:
            self.doc.add_paragraph('\n\n'.join(item['paragraphs']))
        return item

    def close_spider(self, spider):
        # Relies on the title item having been emitted first (see parse()).
        self.doc.save('./%s.docx' % (self.title))


class FanfictionDownloader(scrapy.Spider):

    name = "fanfiction.net"

    custom_settings = {
        "ITEM_PIPELINES": {
            "myspider.StoryPipeline": 300,
        }
    }

    def start_requests(self):
        start_url = 'https://www.fanfiction.net/s/11734723/1/This-Past-Storm'
        yield scrapy.Request(url=start_url, callback=self.parse)

    def parse(self, response):
        title = response.xpath('//*[@id="profile_top"]/b/text()').get()
        storyId = response.xpath('//*[@id="profile_top"]/span[4]/text()[5]').get().replace(' - id: ', '')
        # The chapter <select> appears twice on the page (top and bottom),
        # so take only the first occurrence to avoid duplicated chapter names.
        chapters = response.xpath('(//select[@id="chap_select"])[1]/option/text()').getall()

        yield {
            'title': title,
            'storyId': storyId,
            'chapters': chapters,
            'totalChapters': len(chapters),
        }

        for x in self._parse_paragraphs(response):
            yield x

    def parse_next(self, response):
        # Subsequent chapters yield only paragraph items, no metadata.
        for x in self._parse_paragraphs(response):
            yield x

    def _parse_paragraphs(self, response):
        # Grab every text node inside the story body for this chapter.
        paragraphs = response.xpath('//div[@id="storytext"]//text()').getall()

        yield {'paragraphs': paragraphs}

        # The "Next >" button's onclick is "self.location='/s/<id>/<ch>/<slug>'";
        # strip the JavaScript wrapper to recover the relative URL.
        next_button = response.xpath('(//button[text()="Next >"])[1]/@onclick').get()
        if next_button:
            next_url = next_button.replace('self.location=', '').replace("'", '')
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_next)
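Assuming the spider and the pipeline both live in a single file named myspider.py (which is what the "myspider.StoryPipeline" path in custom_settings implies), you can try it without creating a full Scrapy project:

scrapy runspider myspider.py

runspider loads the spider class straight from the file, and the custom_settings block wires up the pipeline automatically.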
