Unable to load an ASP.NET page using Python urllib2
I'm trying to send a POST request to https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx in order to scrape data from it.
Here is my current code:
from urllib import urlencode
import urllib2

# Configuration
uri = 'https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx'

headers = {
    'HTTP_USER_AGENT': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.13) Gecko/2009073022 Firefox/3.0.13',
    'HTTP_ACCEPT': 'application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Content-Type': 'application/x-www-form-urlencoded',
}

formFields = [(r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber', '003-00013'),
              (r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberTB', '003-00013'),
              (r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberSIS', '0'),
              (r'ctl00%24MainContent%24WellDetailsCriteria1%24ViewDataButton', 'View Data'),
              (r'__VIEWSTATE', r'/wEPDwUJOTc2MzI0NTk4D2QWAmYPDxYEHglQYWdlVGl0bGUFDFdlbGwgRGV0YWlscx4SUmVxdWlyZXNKYXZhU2NyaXB0Z2QWAgIDD2QWCGYPFgIeBFRleHQF1hA8ZGl2IHN0eWxlPSJoZWlnaHQ6IDE0OXB4OyB3aWR0aDogOTUycHg7IGJhY2tncm91bmQtcmVwZWF0OiBuby1yZXBlYXQ7IGJhY2tncm91bmQtaW1hZ2U6dXJsKGh0dHBzOi8vd3d3LmFoczIuZGVwLnN0YXRlLnBhLnVzL2ltYWdlcy9kZXBfZXh0ZXJuYWxfb ... YWRlciRIZWFkZXJWaWV3D2dkrp784OTosLLEOFxy/mWBtsit I6kjKRlZ/ 1IBCkZNk='),
              (r'__EVENTVALIDATION', r'/wEWBALn79faCwK+qZJIAqXY04cBAorCkdMKL5VEAnd1IIQ3cnIHRxZAluFo5G5Y5ffyRXRdtmBiGCc='),
              (r'__EVENTTARGET', r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber'),
              (r'__EVENTARGUMENT', r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber'),
              ]

# Load page
encodedFields = urlencode(formFields)
req = urllib2.Request(uri, encodedFields, headers)
r = urllib2.urlopen(req)

# Handle results
print r.read()
But the page that comes back says "Sorry, we've encountered a technical problem. Please try again later," so I know I must be messing something up. I'm not sending a cookie, but I'm not sure whether one is even necessary. If it is, can I just add "Cookie: ASP.NET_SessionId=whatever" to my request headers, or do I need to use cookielib?
Any light you can shed on the situation would be greatly appreciated!
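In other words, I'm wondering whether the hand-rolled version would be as simple as the sketch below ("whatever" is just a placeholder; I'd have to copy a real session id out of my browser):

# Minimal sketch of setting the cookie by hand -- the session id is a
# placeholder copied from a real browser session, not a value I know works
headers['Cookie'] = 'ASP.NET_SessionId=whatever'
req = urllib2.Request(uri, encodedFields, headers)
r = urllib2.urlopen(req)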
EDIT:
Here is the updated code, which grabs __VIEWSTATE and __EVENTVALIDATION directly from the page (so I don't have to copy and paste them by hand or worry about them expiring).
from urllib import urlencode
import urllib2
from BeautifulSoup import BeautifulSoup
import cookielib

# Configuration
uri = 'https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx'

# Create headers
headers = {
    'Accept': 'application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Host': 'www.paoilandgasreporting.state.pa.us',
    'Origin': 'https://www.paoilandgasreporting.state.pa.us',
    'Referer': 'https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx',
    'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.134 Safari/534.16',
}

# Set up cookie jar
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj), urllib2.HTTPSHandler(debuglevel=1))

# Grab information that we need to pass along with our requests
#r = urllib2.urlopen(uri)
req = urllib2.Request(uri, urlencode([]), headers)
cj.add_cookie_header(req)
r = opener.open(req)
print cj

soup = BeautifulSoup(r.read())
eventvalidation = soup.find('input', id='__EVENTVALIDATION')['value']
viewstate = soup.find('input', id='__VIEWSTATE')['value']

formFields = [('__EVENTVALIDATION', eventvalidation),
              ('__VIEWSTATE', viewstate),
              ('__EVENTTARGET', ''),
              ('__EVENTARGUMENT', ''),
              ('ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber', '003-00013'),
              ('ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberTB', '003-00013'),
              ('ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberSIS', '0'),  # TODO what value to pass?
              ('ctl00$MainContent$WellDetailsCriteria1$ViewDataButton', 'View Data'),  # do we need this?
              ]

# Load page
encodedFields = urlencode(formFields)
req = urllib2.Request(uri, encodedFields, headers)
cj.add_cookie_header(req)
r = opener.open(req)

# Handle results
print r.read()
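One caveat I've noticed with this version: I still send "Accept-Encoding: gzip,deflate,sdch", but urllib2 never decompresses responses on its own, so if the server actually gzips the reply, r.read() would hand compressed bytes to BeautifulSoup. A minimal standard-library workaround sketch:

import gzip
from StringIO import StringIO

body = r.read()
# urllib2 leaves Content-Encoding untouched, so inflate gzipped bodies by hand
if r.info().get('Content-Encoding') == 'gzip':
    body = gzip.GzipFile(fileobj=StringIO(body)).read()
soup = BeautifulSoup(body)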
2 Answers
The data you're submitting probably isn't in the form the server expects, and that's what triggers the error. Take a look at what your browser actually submits and mimic that in your script. There are Firefox add-ons that help with this, such as TamperData, Firebug, and Live HTTP Headers.
The easiest approach, though, may be to use mechanize.
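mechanize fetches the page, fills in the field, and posts the form back with the ASP.NET hidden fields carried along for you. A minimal sketch (untested against this exact site; the field name is copied from the question above, so adjust it if the page differs):

import mechanize

br = mechanize.Browser()
br.set_handle_robots(False)  # skip robots.txt handling, at your own discretion
br.open('https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx')

br.select_form(nr=0)  # the first (and only) ASP.NET form on the page
# Field name taken from the question; hidden fields like __VIEWSTATE are
# kept by mechanize automatically when the form is submitted
br['ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberTB'] = '003-00013'

response = br.submit()
print response.read()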
I ran into the same problem. The only solution I found (which may not be the best one) was to parse those values out of the initial page (when "first visit" is true) and then submit the form with them:
for result in the_page.findAll('input', attrs={'name': '__VIEWSTATE'}):
    view_state = result['value']

for result_1 in the_page.findAll('input', attrs={'name': '__EVENTVALIDATION'}):
    event_validation = result_1['value']

for result_2 in the_page.findAll('input', attrs={'name': '__PREVIOUSPAGE'}):
    previous_page = result_2['value']

for result in the_page.findAll('input', attrs={'name': '__EVENTTARGET'}):
    event_target = result['value']
And then:
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup as soup  # 'soup' is used as the parser below

url = 'http://bandscore.ielts.org/search.aspx'

# 'page' and 'Country' come from the surrounding scraping loop
values = {
    '__EVENTTARGET': 'gdvSearchResults',
    '__EVENTARGUMENT': page,
    '__VIEWSTATE': view_state,
    '__PREVIOUSPAGE': previous_page,
    '__EVENTVALIDATION': event_validation,
    'DropDownList1': Country,
    #'txtSearchInstitution': '',
    #'hdnSearchText': '',
    #'rdoFilter': '%25',
}

user_agent = 'Mozilla/5 (Solaris 10) Gecko'
headers = {'User-Agent': user_agent}

data = urllib.urlencode(values)
req = urllib2.Request(url, data, headers)
response = urllib2.urlopen(req)
thePage = response.read()
the_page = soup(thePage)
I'm still pretty new to Python, but this works for me... hope it helps!
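For what it's worth, since each of those hidden inputs appears only once per page, the same extraction can be written without the loops; a sketch under the same BeautifulSoup assumptions:

# Each hidden input occurs once, so find() is enough
view_state = the_page.find('input', attrs={'name': '__VIEWSTATE'})['value']
event_validation = the_page.find('input', attrs={'name': '__EVENTVALIDATION'})['value']
previous_page = the_page.find('input', attrs={'name': '__PREVIOUSPAGE'})['value']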