在 fanfiction.net 上,下面是用于选择故事章节的 HTML 代码:
<select id="chap_select" title="Chapter Navigation" name="chapter" onchange="self.location = '/s/13109220/'+ this.options[this.selectedIndex].value + '/Son-of-the-Hunt';">
<option value="1" selected="">1. Chapter 1</option>
<option value="2">2. Camp</option>
<option value="3">3. Chapter 3</option>
</select>
我想用它继续下载下一章的文本内容,但是通常的做法(递归调用 self.fanfiction())由于 self.storyNum += 1 这一行的缘故而不起作用。
import scrapy, docx, time
import subprocess as sp
class FanfictionDownloader(scrapy.Spider):
    """Scrapy spider that saves story text from fanfiction.net to a .docx file.

    URLs are read interactively in ``start_requests``. Each fanfiction.net
    page is handled by ``fanfiction``; Wattpad URLs are routed to ``wattpad``,
    which is currently a stub.
    """

    name = "fanfiction"
    # Number of fanfiction.net responses handled so far.
    storyNum = 0
    # When true, the next paragraph written is preceded by a page break.
    nextPage = False
    # URLs entered by the user, in input order (rebound in start_requests).
    urls = []
    # Text of the paragraph most recently written to the document.
    docText = ''
    # Title of the story most recently parsed.
    title = ''

    def start_requests(self):
        """Prompt for story URLs and yield one request per URL.

        Yields:
            scrapy.Request: with ``fanfiction`` or ``wattpad`` as callback,
            chosen by the URL prefix.

        A URL matching neither prefix prints a message, sleeps five seconds
        and exits the process via ``quit()``.
        """
        # 'cls' is the Windows console clear command.
        sp.call('cls', shell=True)
        self.urls = input("Enter url seperated by a comma and space (, ): ").split(', ')
        for url in self.urls:
            # Dispatch on the URL actually being requested. Indexing
            # self.urls with self.storyNum would test the same entry for
            # every iteration of this loop.
            if url.startswith('https://www.fanfiction.net/s/'):
                yield scrapy.Request(url=url, callback=self.fanfiction)
            elif url.startswith('https://www.wattpad.com/'):
                yield scrapy.Request(url=url, callback=self.wattpad)
            else:
                print('Not a valid link, ending downloader.')
                time.sleep(5)
                quit()
        sp.call('cls', shell=True)

    def fanfiction(self, response):
        """Parse one fanfiction.net page and write its paragraphs to a .docx.

        Args:
            response: the Scrapy response for the story page.

        Yields:
            dict: one item per ``<p>`` under ``div#storytext``, keyed
            ``'line <n>'`` (1-based), whose value is a list of lists holding
            the paragraph's text nodes followed by story metadata.

        The document is saved to ``./../<title>.docx`` after every paragraph.
        """
        self.storyNum += 1
        story_num = self.storyNum
        # NOTE(review): mapping the counter back onto self.urls assumes
        # responses arrive in input order - confirm. Fall back to the
        # response's own URL when the callback has run more often than there
        # are input URLs (e.g. when it is re-entered for follow-up chapters),
        # instead of raising IndexError.
        if story_num <= len(self.urls):
            story_url = self.urls[story_num - 1]
        else:
            story_url = response.url

        doc = docx.Document()

        # Page-level metadata is identical for every paragraph, so query it
        # once rather than once per <p>.
        title = response.xpath('//*[@id="profile_top"]/b/text()').get()
        self.title = title
        raw_id = response.xpath('//*[@id="profile_top"]/span[4]/text()[5]').get()
        # .get() returns None when the node is missing; do not call
        # .replace() on None.
        storyId = raw_id.replace(' - id: ', '') if raw_id is not None else None
        chapters = response.xpath('//*[@id="chap_select"]/option/text()').getall()
        # Only the first half of the option texts is used; presumably the
        # chapter selector is rendered twice on the page - TODO confirm.
        chapter_names = chapters[0:len(chapters) // 2]
        totalChapters = len(chapter_names)

        for i, para in enumerate(response.css('div#storytext > p'), start=1):
            # Text directly in the <p> plus text nested up to two elements deep.
            text = para.xpath('text() | */text() | */*/text()').getall()
            finalText = [
                [x.replace('\u00ef', 'ï').replace('\u2013', '–').replace('\u2026', '...') for x in text],
                ['Story %s: %s' % (story_num, story_url)],
                ['Title: %s' % (title)],
                ['Story ID: %s' % (storyId)],
                ['Total Chapters: %s' % (totalChapters)],
                ['Chapter Names: %s' % [chapter_names]],
            ]

            # Join every text node of the paragraph. Testing the length of
            # the first node (rather than the number of nodes) dropped the
            # rest of the paragraph when that node was a single character,
            # and raised IndexError when the paragraph had no text nodes.
            self.docText = ''.join(finalText[0])

            if self.nextPage:
                # Document.add_page_break() takes no arguments; the text is
                # written as a normal paragraph after the break.
                doc.add_page_break()
                self.nextPage = False
            doc.add_paragraph(self.docText)

            sp.call('cls', shell=True)
            doc.save('./../%s.docx' % (title))
            yield {'line ' + str(i): finalText}
            sp.call('cls', shell=True)

    def wattpad(self, response):
        """Callback for Wattpad story pages; not implemented yet."""
        pass
你真的需要为故事保留一个计数器吗?
我想你只要找到下一页就可以了,比如:
正如评论中提到的,您应该使用 Item Pipeline(项目管道)来负责把抓取到的 item“存储”到文档中。
下面的代码可以给你一个思路,它在我这里可以正常运行,但你需要根据自己的用例进行调整:
相关问题 更多 >
编程相关推荐