我正在尝试用scrapy刮网站。该网站有三个下拉菜单,网站还使用__VIEWSTATE
。我可以提取第一个下拉列表('dcode')的值,但无法提取第二个下拉列表('blk')的选项。你知道吗
我不明白为什么我的代码没有进入parse_blk
函数??你知道吗
我犯了个错误
Traceback (most recent call last):
File "c:\wpy-3670\python-3.6.7.amd64\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback
yield next(it)
File "c:\wpy-3670\python-3.6.7.amd64\lib\site-packages\scrapy\core\spidermw.py", line 84, in evaluate_iterable
for r in iterable:
File "c:\wpy-3670\python-3.6.7.amd64\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "c:\wpy-3670\python-3.6.7.amd64\lib\site-packages\scrapy\core\spidermw.py", line 84, in evaluate_iterable
for r in iterable:
File "c:\wpy-3670\python-3.6.7.amd64\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "c:\wpy-3670\python-3.6.7.amd64\lib\site-packages\scrapy\core\spidermw.py", line 84, in evaluate_iterable
for r in iterable:
File "c:\wpy-3670\python-3.6.7.amd64\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\wpy-3670\python-3.6.7.amd64\lib\site-packages\scrapy\core\spidermw.py", line 84, in evaluate_iterable
for r in iterable:
File "c:\wpy-3670\python-3.6.7.amd64\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\WPy-3670\uplist\uplist\spiders\test1.py", line 65, in parse_blk
dont_filter=True
File "c:\wpy-3670\python-3.6.7.amd64\lib\site-packages\scrapy\http\request\form.py", line 49, in from_response
form = _get_form(response, formname, formid, formnumber, formxpath)
File "c:\wpy-3670\python-3.6.7.amd64\lib\site-packages\scrapy\http\request\form.py", line 84, in _get_form
raise ValueError("No <form> element found in %s" % response)
ValueError: No <form> element found in <200 http://sec.up.nic.in/site/PRIVoterSearch2015.aspx>
到目前为止我的代码
import scrapy,re
from scrapy.item import Item
#from scrapy.shell import inspect_response
class blkname(Item):
text = scrapy.Field()
class test1(scrapy.Spider):
name = "test1"
allowed_domains = ["sec.up.nic.in"]
start_urls = ["http://sec.up.nic.in/site/PRIVoterSearch2015.aspx"]
def parse(self, response):
for dcode in response.css('select#dcode > option ::attr(value)').extract():
#print( response.css('input#__VIEWSTATEGENERATOR::attr(value)').extract_first())
#print(response.css('input#__VIEWSTATE::attr(value)').extract_first())
#print(dcode)
yield scrapy.FormRequest.from_response(
response,
headers={'user-agent': 'Mozilla/5.0'},
formdata={
'dcode': dcode,
'__VIEWSTATE': response.css('input#__VIEWSTATE::attr(value)').extract_first(),
'__EVENTTARGET': 'dcode',
'__ASYNCPOST': 'true',
},
callback=self.parse_blk,
dont_filter=True
)
def parse_blk(self, response):
for blk in response.css('select#blk > option ::attr(value)').extract():
#block = response.css('select#blk > option ::attr(value)').extract()
#print(block)
#print(response.css('hiddenField|__VIEWSTATE::attr(value)').extract_first())
#data = re.findall("__VIEWSTATE| =(.+?);|", response.body.decode("utf-8"), re.S)
data = re.findall("(?<=__VIEWSTATE).*$", response.body.decode("utf-8"), re.S)
#print(data)
#print(block)
viewstate = str(data).split('|')[1]
#print (viewstate)
yield scrapy.FormRequest.from_response(
response,
headers={'user-agent': 'Mozilla/5.0'},
formdata={
'dcode':response.css('select#dcode > option[selected] ::attr(value)').extract_first(),
'blk': blk,
'__VIEWSTATE': viewstate,
'__EVENTTARGET': 'blk',
'__ASYNCPOST': 'true',
},
callback=self.parse_gp,
dont_filter=True
)
def parse_gp(self, response):
for gp in response.css('select#gp > option ::attr(value)').extract():
print(gp)
请帮助我,因为我是一个初学者在这个领域,我没有深入的知识,网页刮。你知道吗
我从youtube https://www.youtube.com/watch?v=ve_0h4Y8nuI&list=PLhTjy8cBISEqkN-5Ku_kXG4QW33sxQo0t
学到了关于scrapy的知识,并在https://blog.scrapinghub.com/2016/04/20/scrapy-tips-from-the-pros-april-2016-edition
的基础上编写了代码
parse()应该是
for more detail
相关问题 更多 >
编程相关推荐