How do I use Scrapy to get to the next chapter on fanfiction.net?

Posted 2024-06-02 08:42:08


On fanfiction.net, this is the HTML for a story's chapter-navigation selector:

<select id="chap_select" title="Chapter Navigation" name="chapter" onchange="self.location = '/s/13109220/'+ this.options[this.selectedIndex].value + '/Son-of-the-Hunt';">
  <option value="1" selected="">1. Chapter 1</option>
  <option value="2">2. Camp</option>
  <option value="3">3. Chapter 3</option>
</select>
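The onchange handler builds each chapter's URL as /s/<story id>/<chapter value>/<slug>. As a minimal sketch of how the next chapter's URL could be derived from this selector inside a Scrapy callback (the helper name next_chapter_url is hypothetical, and the URL layout is assumed to match the handler above):

def next_chapter_url(response):
    # Value of the currently selected <option>, e.g. '1'.
    current = response.xpath(
        '(//select[@id="chap_select"])[1]/option[@selected]/@value').get()
    # All chapter values in order, e.g. ['1', '2', '3'].
    values = response.xpath(
        '(//select[@id="chap_select"])[1]/option/@value').getall()
    if current is None or current == values[-1]:
        return None  # no selector found, or already on the last chapter
    next_value = values[values.index(current) + 1]
    # Assumes the URL has the shape https://.../s/<story id>/<chapter>/<slug>,
    # matching the onchange handler above; swap in the next chapter value.
    parts = response.url.split('/')
    parts[5] = next_value
    return '/'.join(parts)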

I want to use it to go on and download the next chapter's text, but the usual approach, recursively calling self.fanfiction(), doesn't work here because of the self.storyNum += 1 line.

import scrapy, docx, time
import subprocess as sp

class FanfictionDownloader(scrapy.Spider):
    name = "fanfiction"
    storyNum = 0
    nextPage = False
    urls = []
    docText = ''
    title = ''

    def start_requests(self):
        sp.call('cls', shell=True)
        self.urls = list(str(input("Enter url separated by a comma and space (, ): ")).split(', '))
        for url in self.urls:
            if self.urls[self.storyNum].startswith('https://www.fanfiction.net/s/'):
                yield scrapy.Request(url=url, callback=self.fanfiction)
            elif self.urls[self.storyNum].startswith('https://www.wattpad.com/'):
                yield scrapy.Request(url=url, callback=self.wattpad)
            else:
                print('Not a valid link, ending downloader.')
                time.sleep(5)
                quit()
                sp.call('cls', shell=True)

    def fanfiction(self, response):
        self.storyNum += 1
        doc = docx.Document()
        chapters = ''
        totalChapters = 0
        currentChapter = 1
        i = 0
        for para in response.css('div#storytext > p'):
            text = (para.xpath('text() | */text() | */*/text()').getall())
            self.title = (response.xpath('//*[@id="profile_top"]/b/text()').get())
            storyId = ((response.xpath('//*[@id="profile_top"]/span[4]/text()[5]').get()).replace(' - id: ', ''))
            chapters = (response.xpath('//*[@id="chap_select"]/option/text()').getall())
            totalChapters = len(chapters[0:int(len(chapters) / 2)])
            finalText = [
                [x.replace('\u00ef', 'ï').replace('\u2013', '–').replace('\u2026', '...') for x in text],
                ['Story %s: %s' % (self.storyNum,  self.urls[self.storyNum - 1])],
                ['Title: %s' % (self.title)],
                ['Story ID: %s' % (storyId)],
                ['Total Chapters: %s' % (totalChapters)],
                ['Chapter Names: %s' % [chapters[0:int(len(chapters) / 2)]]],
            ]
            if len(finalText[0][0]) > 1:
                self.docText = (''.join(finalText[0][0:]))
            else:
                self.docText = finalText[0][0]
            if self.nextPage == False:
                doc.add_paragraph(self.docText)
            else:
                doc.add_page_break(self.docText)
                self.nextPage = False
                doc.add_paragraph()
            sp.call('cls', shell=True)
            doc.save('./../%s.docx' % (self.title))
            i += 1
            yield {'line ' + str(i): finalText}
            sp.call('cls', shell=True)

    def wattpad(self, response):
        pass

1 Answer

Reply #1 · Anonymous user · Posted 2024-06-02 08:42:08

Do you really need to keep a counter for the story?

I think you can just look for the next-page button, for example:

if response.xpath('//button[text()="Next >"]'):
    next_link = response.xpath('//button[text()="Next >"]')[0].attrib['onclick'].replace('self.location=', '').replace("'", '')
    yield response.follow('https://www.fanfiction.net' + next_link, self.fanfiction)
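To make the string handling concrete: on the chapter pages the button's onclick value looks roughly like self.location='/s/13109220/2/Son-of-the-Hunt' (modeled on the selector markup in the question, not copied from a live page), so the two replace() calls strip the JavaScript wrapper and leave a relative path:

# Hypothetical onclick value, modeled on the chapter selector above.
onclick = "self.location='/s/13109220/2/Son-of-the-Hunt'"
next_link = onclick.replace('self.location=', '').replace("'", '')
print(next_link)  # -> /s/13109220/2/Son-of-the-Hunt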

As mentioned in the comments, you should use an item pipeline to take care of "storing" your items in the document.

Here is something to give you an idea. It works for me, but you will have to adapt it to your use case:

import docx
import scrapy

class StoryPipeline:

    def open_spider(self, spider):
        # One document per crawl; filled as items arrive, saved on close.
        self.doc = docx.Document()

    def process_item(self, item, spider):
        # The metadata item carries the title; everything else is chapter text.
        if 'title' in item:
            self.title = item['title']
            self.doc.add_paragraph(str(item))
        else:
            self.doc.add_paragraph('\n\n'.join(item['paragraphs']))
        return item

    def close_spider(self, spider):
        # Relies on the title item having been emitted first (see parse()).
        self.doc.save('./%s.docx' % (self.title))


class FanfictionDownloader(scrapy.Spider):

    name = "fanfiction.net"

    custom_settings = {
        "ITEM_PIPELINES": {
            "myspider.StoryPipeline": 300,
        }
    }

    def start_requests(self):
        start_url = 'https://www.fanfiction.net/s/11734723/1/This-Past-Storm'
        yield scrapy.Request(url=start_url, callback=self.parse)

    def parse(self, response):
        title = response.xpath('//*[@id="profile_top"]/b/text()').get()
        storyId = response.xpath('//*[@id="profile_top"]/span[4]/text()[5]').get().replace(' - id: ', '')
        # The chapter <select> appears twice on the page (top and bottom),
        # so take only the first occurrence to avoid duplicated chapter names.
        chapters = response.xpath('(//select[@id="chap_select"])[1]/option/text()').getall()

        yield {
            'title': title,
            'storyId': storyId,
            'chapters': chapters,
            'totalChapters': len(chapters),
        }

        for x in self._parse_paragraphs(response):
            yield x

    def parse_next(self, response):
        # Subsequent chapters yield only paragraph items, no metadata.
        for x in self._parse_paragraphs(response):
            yield x

    def _parse_paragraphs(self, response):
        # Grab every text node inside the story body for this chapter.
        paragraphs = response.xpath('//div[@id="storytext"]//text()').getall()

        yield {'paragraphs': paragraphs}

        # The "Next >" button's onclick is "self.location='/s/<id>/<ch>/<slug>'";
        # strip the JavaScript wrapper to recover the relative URL.
        next_button = response.xpath('(//button[text()="Next >"])[1]/@onclick').get()
        if next_button:
            next_url = next_button.replace('self.location=', '').replace("'", '')
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_next)
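Assuming the spider and the pipeline both live in a single file named myspider.py (which is what the "myspider.StoryPipeline" path in custom_settings implies), you can try it without creating a full Scrapy project:

scrapy runspider myspider.py

runspider loads the spider class straight from the file, and the custom_settings block wires up the pipeline automatically.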
