Scrapy 实现分页和下一页
下一页按钮在点击时并没有改变网址,所以我在使用scrapy的时候遇到了问题。
'''
import scrapy
class LegonSpider(scrapy.Spider):
    """Spider for the "Find a Post" search on mylegion.org.

    Flow: fetch the search page, submit the search form, then for every
    result page follow each result's link to its detail page and try to
    advance to the next result page through a form postback.
    """

    name = "legon"

    def start_requests(self):
        """Yield the initial request for the search page, handled by ``parse``."""
        yield scrapy.Request(
            url="https://mylegion.org/PersonifyEbusiness/Find-a-Post",
            callback=self.parse
        )

    def parse(self, response):
        """Submit the search form found in ``response``.

        The form with id ``aspnetForm`` is re-submitted with three fields
        overridden; the result is handled by ``parse_post_page``.
        """
        # Select distance and country
        # NOTE(review): '@IP_DEPARTMENT' is presumably a department/state
        # filter -- confirm the meaning of '00000000001L' against the site.
        yield scrapy.FormRequest.from_response(
            response,
            formid='aspnetForm',
            formdata={'dnn$ctr2802$DNNWebControlContainer$ctl00$DistanceList': '100',
                      '@IP_COUNTRY': 'USA',
                      '@IP_DEPARTMENT': '00000000001L'},
            callback=self.parse_post_page
        )

    def parse_post_page(self, response):
        """Handle one page of search results.

        Yields one request per result item (to ``parse_post_detail``, with the
        post name passed along in ``meta['post_num']``) and, if the "Next"
        button is found, a form submission that calls this method again.
        """
        # Extract and yield requests for post detail pages
        post_elements = response.xpath("//div[@class='membership-dir-result-item']")
        for post_element in post_elements:
            # NOTE(review): .get() returns None when the text node is absent,
            # in which case .strip() raises AttributeError.
            post_num = post_element.xpath(".//div[contains(@class,'POST_NAME')]/text()").get().strip()
            post_link = post_element.xpath("./a/@href").get()
            yield response.follow(post_link, callback=self.parse_post_detail, meta={'post_num': post_num})
        # NOTE(review): this XPath starts with a single '/', so it is evaluated
        # from the document root and presumably never matches an <input> nested
        # inside the page -- confirm whether '//input[...]' was intended.
        next_page_button = response.xpath("/input[@id='dnn_ctr2802_DNNWebControlContainer_ctl00_Next']")
        if next_page_button:
            # Extract form data for next page submission
            # NOTE(review): only __EVENTTARGET/__EVENTARGUMENT are overridden;
            # the Next button's own name/value pair is not part of this form
            # data -- compare with the request a browser sends.
            formdata = {
                '__EVENTTARGET': 'dnn$ctr2802$DNNWebControlContainer$ctl00$Next',
                '__EVENTARGUMENT': ''
            }
            yield scrapy.FormRequest.from_response(response, formdata=formdata, callback=self.parse_post_page)

    def parse_post_detail(self,response):
        """Extract one item from a post detail page.

        Each field is the text found under the second child ``div`` of the
        block whose class contains the given keyword; a field is ``None``
        when nothing matches. ``post_num`` comes from ``response.meta``.
        """
        # First and second matching text nodes under the 'Leadership' block.
        leader1 = response.xpath("(//div[contains(@class,'Leadership')]/div[2]/text())[1]").get()
        leader2 = response.xpath("(//div[contains(@class,'Leadership')]/div[2]/text())[2]").get()
        address = response.xpath("//div[contains(@class,'Address')]/div[2]/text()").get()
        typ = response.xpath("//div[contains(@class,'Type')]/div[2]/text()").get()
        yield {
            "post_num": response.meta['post_num'],
            "leader1": leader1,
            "leader2": leader2,
            "address": address,
            "type" : typ
        }
我觉得scrapy根本没有去下一页,它一直停留在基础网址上,而这个网址在我点击下一页或者尝试使用新的搜索方法时都没有变化。
1 个回答
0
当我查看响应时,发现我总是得到同样的页面。
如果我们用 BurpSuite 来检查请求并进行比较,就能看到这一部分:

你可以在右边看到“Next”的值,但如果我们检查响应中的表单数据,就会发现这个值缺失了。我们只需要把它加上:
import scrapy
class LegonSpider(scrapy.Spider):
    """Crawl the "Find a Post" search on mylegion.org, page by page.

    The result pages are advanced by re-submitting the page's form rather
    than by following a new URL, so each next page is requested by posting
    the form from the previous response with the "Next" button's own
    name/value pair added to the form data.
    """

    name = "legon"

    def start_requests(self):
        """Yield the initial request for the search page, handled by ``parse``."""
        yield scrapy.Request(
            url="https://mylegion.org/PersonifyEbusiness/Find-a-Post",
            callback=self.parse,
        )

    def parse(self, response):
        """Submit the search form found in ``response``.

        The form with id ``aspnetForm`` is re-submitted with the distance,
        country and department fields overridden; the result is handled by
        ``parse_post_page``.
        """
        # Select distance and country
        # NOTE(review): '@IP_DEPARTMENT' is presumably a department/state
        # filter -- confirm the meaning of '00000000001L' against the site.
        yield scrapy.FormRequest.from_response(
            response,
            formid='aspnetForm',
            formdata={
                'dnn$ctr2802$DNNWebControlContainer$ctl00$DistanceList': '100',
                '@IP_COUNTRY': 'USA',
                '@IP_DEPARTMENT': '00000000001L',
            },
            callback=self.parse_post_page,
        )

    def parse_post_page(self, response):
        """Handle one page of search results and schedule the next one.

        Yields one request per result item (to ``parse_post_detail``, with
        the post name passed along in ``meta['post_num']``). Results without
        a link are logged and skipped. If the "Next" button is present, the
        form is submitted again with that button's name/value pair and the
        response is handled by this same method.
        """
        for post_element in response.xpath("//div[@class='membership-dir-result-item']"):
            post_link = post_element.xpath("./a/@href").get()
            if post_link is None:
                # response.follow() cannot build a request without a URL, so a
                # single malformed result must not abort the whole page.
                self.logger.warning("Result item without a link on %s", response.url)
                continue
            # default="" keeps .strip() safe when the name text node is absent.
            post_num = post_element.xpath(
                ".//div[contains(@class,'POST_NAME')]/text()"
            ).get(default="").strip()
            yield response.follow(
                post_link,
                callback=self.parse_post_detail,
                meta={'post_num': post_num},
            )

        next_page_button = response.xpath(
            "//input[@id='dnn_ctr2802_DNNWebControlContainer_ctl00_Next']"
        )
        if next_page_button:
            # The button's own name/value pair has to be part of the posted
            # form data; without it the same page is returned again.
            form_data = {'dnn$ctr2802$DNNWebControlContainer$ctl00$Next': 'Next'}
            yield scrapy.FormRequest.from_response(
                response,
                # Same form as in parse(); presumably the only form on the page.
                formid='aspnetForm',
                formdata=form_data,
                callback=self.parse_post_page,
            )

    def parse_post_detail(self, response):
        """Extract one item from a post detail page.

        Each field is the text found under the second child ``div`` of the
        block whose class contains the given keyword; a field is ``None``
        when nothing matches. ``post_num`` comes from ``response.meta``.
        """
        # First and second matching text nodes under the 'Leadership' block.
        leader1 = response.xpath("(//div[contains(@class,'Leadership')]/div[2]/text())[1]").get()
        leader2 = response.xpath("(//div[contains(@class,'Leadership')]/div[2]/text())[2]").get()
        address = response.xpath("//div[contains(@class,'Address')]/div[2]/text()").get()
        typ = response.xpath("//div[contains(@class,'Type')]/div[2]/text()").get()
        yield {
            "post_num": response.meta['post_num'],
            "leader1": leader1,
            "leader2": leader2,
            "address": address,
            "type": typ,
        }
看看我的表单数据和你的有什么不同。
顺便说一下,你在 next_page_button 的选择器中漏了一个 /:XPath 应以 //input 开头,而不是 /input。