我对Scrapy
非常陌生,我尝试从这个website中的每个页面获取表数据
这是我的代码:
import scrapy
class UAESpider(scrapy.Spider):
name = 'uae_free'
allowed_domains = ['https://www.uaeonlinedirectory.com']
start_urls = [
'https://www.uaeonlinedirectory.com/UFZOnlineDirectory.aspx?item=A'
]
def parse(self, response):
pages = response.xpath('//table[@class="GridViewStyle"]//tr[12]')
for page in pages[1:11]:
rows = page.xpath('//table[@class="GridViewStyle"]//tr')
for row in rows[1:11]:
yield {
'company_name': row.xpath('.//td[2]//text()').get(),
'company_name_link': row.xpath('.//td[2]//a/@href').get(),
'zone': row.xpath('.//td[4]//text()').get(),
'category': row.xpath('.//td[6]//text()').get(),
'category_link': row.xpath('.//td[6]//a/@href').get()
}
next_page = response.xpath('//table[@class="GridViewStyle"]//tr[12]//td[11]//a/@href').get()
if next_page:
yield scrapy.Request(url=next_page, callback=self.parse)
但它不起作用,我得到这个错误,下面的URL是指向page 11
的链接:
ValueError: Missing scheme in request url: javascript:__doPostBack('ctl00$ContentPlaceHolder2$grdDirectory','Page$11')
你们知道怎么修复这个错误吗
更新:
按照@zmike建议的这个answer的指示,这是我到目前为止所做的:
import scrapy
from scrapy.http import FormRequest
URL = 'https://www.uaeonlinedirectory.com/UFZOnlineDirectory.aspx?item=A'
class UAESpider(scrapy.Spider):
name = 'uae_free'
allowed_domains = ['https://www.uaeonlinedirectory.com/UFZOnlineDirectory.aspx?item=A']
start_urls = [
'https://www.uaeonlinedirectory.com/UFZOnlineDirectory.aspx?item=A'
]
def parse(self, response):
self.data = {}
for form_input in response.css('form#aspnetForm input'):
name = form_input.xpath('@name').extract()[0]
try:
value = form_input.xpath('@value').extract()[0]
except IndexError:
value = ""
self.data[name] = value
self.data['ctl00_ContentPlaceHolder2_panelGrid'] = 'ctl00$ContentPlaceHolder2$grdDirectory'
self.data['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder2$grdDirectory'
self.data['__EVENTARGUMENT'] = 'Page$1'
return FormRequest(url=URL,
method='POST',
callback=self.parse_page,
formdata=self.data,
meta={'page':1},
dont_filter=True)
def parse_page(self, response):
current_page = response.meta['page'] + 1
rows = response.xpath('//table[@class="GridViewStyle"]//tr')
for row in rows[1:11]:
yield {
'company_name': row.xpath('.//td[2]//text()').get(),
'company_name_link': row.xpath('.//td[2]//a/@href').get(),
'zone': row.xpath('.//td[4]//text()').get(),
'category': row.xpath('.//td[6]//text()').get(),
'category_link': row.xpath('.//td[6]//a/@href').get()
}
return FormRequest(url=URL,
method='POST',
formdata={
'__EVENTARGUMENT': 'Page$%d' % current_page,
'__EVENTTARGET': 'ctl00$ContentPlaceHolder2$grdDirectory',
'ctl00_ContentPlaceHolder2_panelGrid':'ctl00$ContentPlaceHolder2$grdDirectory',
'':''},
meta={'page': current_page},
dont_filter=True)
这段代码只从第一个页面获取表数据,它不会移动到剩余页面。你知道我哪里做错了吗
下面是一个正在运行的(尽管很粗糙)爬虫实现,它可以遍历所有页面。一些注意事项:
__EVENTTARGET
、__EVENTVALIDATION
、__VIEWSTATEGENERATOR
等。self.data['ctl00_ContentPlaceHolder2_panelGrid'] = 'ctl00$ContentPlaceHolder2$grdDirectory'
相关问题 更多 >
编程相关推荐