我不太明白:我运行爬虫时一直遇到 IndexError,提示索引超出范围。这是因为提取到的链接列表是空的吗?在这里提取并拼接链接的正确写法应该是什么?
class POSpider(CrawlSpider):
    """Scrape poem texts from poets.org.

    Flow: ``parse`` collects theme ids from the landing page,
    ``parse_layer2`` collects poem links from each theme listing, and
    ``parse_layer3`` extracts the poem text into ``PoetryItem`` objects.

    NOTE(review): overriding ``parse`` on a CrawlSpider disables its
    rule-based crawling; since no ``rules`` are defined here it behaves
    like a plain Spider, so this works — but ``scrapy.Spider`` would be
    the more honest base class.
    """
    name = 'po'
    start_urls = ['https://www.poets.org/poetsorg/poems']
    # BUG FIX: allowed_domains must hold bare domain names only.  The old
    # value 'poets.org/poetsorg/poems' (a path) made the offsite middleware
    # filter every request, so callbacks never fired.
    allowed_domains = ['poets.org']

    def parse(self, response):
        """Request one theme-listing page per theme found on the landing page."""
        hrefs = response.xpath('//*[@class="themes"]//a/@href').extract()
        for href in hrefs:
            # BUG FIX: the old item[855:1412] slice assumed enormous URLs and
            # returned '' for normal-length hrefs, producing broken requests.
            # Take the tid from the end of the href (query value or last path
            # component) instead.
            tid = href.rstrip('/').rsplit('=', 1)[-1].rsplit('/', 1)[-1]
            # BUG FIX: the old URL template contained a literal space before
            # 'field_occasion_tid', corrupting the query string.
            url = ('https://www.poets.org/poetsorg/poems'
                   '?field_occasion_tid=All&field_poem_themes_tid=' + tid)
            yield scrapy.Request(url, callback=self.parse_layer2,
                                 dont_filter=True)

    def parse_layer2(self, response):
        """Request every poem page linked from a theme-listing page."""
        # BUG FIX: the old code did `xpath(...)[-1].extract()`, which
        # (a) raises IndexError ("list index out of range") whenever the
        # match is empty — the error being asked about — and (b) returns a
        # single string, so the subsequent loop iterated over its CHARACTERS.
        # extract() on the whole SelectorList returns a list of href strings.
        hrefs = response.xpath(
            '//*[@id="block-views-poems-poems-block-all"]'
            '/div/div//tbody//td[2]//@href').extract()
        for href in hrefs:
            # BUG FIX: poem hrefs are site-relative; join them against the
            # response URL so Scrapy gets an absolute URL.
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_layer3,
                                 dont_filter=True)

    def parse_layer3(self, response):
        """Extract the poem text from a poem page and return PoetryItems."""
        items = []
        # BUG FIX: iterate over the Selector objects, NOT `.extract()`ed
        # strings — plain strings have no `.xpath()` method.
        for poem in response.xpath('//*[@id="poem-content"]/div[2]/div/div'):
            lines = poem.xpath('*/p/text()').extract()
            # strip_list is a project helper; presumably it joins/cleans the
            # extracted text fragments into one string — TODO confirm.
            text = strip_list(lines)
            # Python 2 str: encode to ASCII (replacing non-ASCII), lowercase,
            # and terminate with CRLF, as the original did.
            text = text.encode('ascii', 'replace').lower() + '\r\n'
            item = PoetryItem()
            item['poem'] = text
            items.append(item)
        return items
这就是我一直得到的结果(原帖此处附有一段错误回溯的代码块,在页面提取时被占位符 pr2 替换,内容已丢失):
目前没有回答
相关问题 更多 >
编程相关推荐