使用Python向ASP.NET页面发起POST请求
我想从这个网站 "http://www.indiapost.gov.in/pin/" 上抓取邮政编码,我用下面的代码在尝试。
import urllib
import urllib2
# Browser-like request headers.
# NOTE(review): nothing in this snippet passes `headers` to the opener.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Origin': 'http://www.indiapost.gov.in',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Referer': 'http://www.indiapost.gov.in/pin/',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
}

# Hard-coded hidden-field values, exactly as posted (both literals end
# in '...').
viewstate = 'JulXDv576ZUXoVOwThQQj4bDuseXWDCZMP0tt+HYkdHOVPbx++G8yMISvTybsnQlNN76EX/...'
eventvalidation = '8xJw9GG8LMh6A/b6/jOWr970cQCHEj95/6ezvXAqkQ/C1At06MdFIy7+iyzh7813e1/3Elx...'

url = 'http://www.indiapost.gov.in/pin/'

# Ordered (name, value) pairs that make up the POST body.
# '__EVENTVALIDATION' appears twice, as in the original post.
formData = (
    ('__EVENTVALIDATION', eventvalidation),
    ('__EVENTTARGET', ''),
    ('__EVENTARGUMENT', ''),
    ('__VIEWSTATE', viewstate),
    ('__VIEWSTATEENCRYPTED', ''),
    ('__EVENTVALIDATION', eventvalidation),
    ('txt_offname', ''),
    ('ddl_dist', '0'),
    ('txt_dist_on', ''),
    ('ddl_state', '2'),
    ('btn_state', 'Search'),
    ('txt_stateon', ''),
    ('hdn_tabchoice', '3'),
)
from urllib import FancyURLopener
class MyOpener(FancyURLopener):
    """FancyURLopener that identifies itself with a desktop Chrome UA string.

    The opener sends its ``version`` attribute as the User-Agent header,
    so overriding it here replaces the default Python-urllib value.
    """

    version = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'
myopener = MyOpener()

# urlencode turns the (name, value) pairs into the
# application/x-www-form-urlencoded request body.
encodedFields = urllib.urlencode(formData)

# Supplying a data argument makes the opener issue a POST instead of a GET.
f = myopener.open(url, encodedFields)
print(f.info())

try:
    fout = open('tmp.txt', 'w')
except IOError:
    # Only a failed open is reported here; the write is skipped because
    # there is no file object to write to.
    print('Could not open output file\n')
else:
    # The with-block closes the file even if reading the response or
    # writing it out raises.
    with fout:
        fout.writelines(f.readlines())
但是我从服务器得到的回复是:“抱歉,这个网站遇到了严重问题,请尝试重新加载页面或联系网站管理员。”请告诉我我哪里出错了。
1 个回答
19
你的 viewstate 和 eventvalidation 的值是从哪里得到的?首先,它们的值不应该以“...”结尾,你很可能漏掉了后面的内容。其次,它们不应该被硬编码,也就是说不应该直接写死在代码里。
有一个解决方案是这样的:
- 不带任何表单数据,直接请求 "http://www.indiapost.gov.in/pin/" 获取页面。
- 解析页面,提取 __VIEWSTATE 和 __EVENTVALIDATION 等表单值(可以使用 BeautifulSoup 这个工具)。
- 带上第二步中提取到的关键表单数据,发起第二次 HTTP 请求以获取搜索结果。
更新:
根据上面的思路,我稍微修改了你的代码,让它可以工作:
import urllib
from bs4 import BeautifulSoup
# Browser-like request headers.
# NOTE(review): nothing in this snippet passes `headers` to the opener.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Origin': 'http://www.indiapost.gov.in',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Referer': 'http://www.indiapost.gov.in/pin/',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
}
class MyOpener(urllib.FancyURLopener):
    """FancyURLopener that identifies itself with a desktop Chrome UA string.

    The opener sends its ``version`` attribute as the User-Agent header,
    so overriding it here replaces the default Python-urllib value.
    """

    version = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'
myopener = MyOpener()
url = 'http://www.indiapost.gov.in/pin/'

# first HTTP request without form data: fetch the page so the hidden
# form fields it contains can be read back
f = myopener.open(url)
soup = BeautifulSoup(f)

# parse and retrieve two vital form values
# (each lookup raises IndexError if the page has no element with that id)
viewstate = soup.select("#__VIEWSTATE")[0]['value']
eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']

# Ordered (name, value) pairs for the search form, including the two
# hidden values just extracted from the page.
formData = (
    ('__EVENTVALIDATION', eventvalidation),
    ('__VIEWSTATE', viewstate),
    ('__VIEWSTATEENCRYPTED', ''),
    ('txt_offname', ''),
    ('ddl_dist', '0'),
    ('txt_dist_on', ''),
    ('ddl_state', '1'),
    ('btn_state', 'Search'),
    ('txt_stateon', ''),
    ('hdn_tabchoice', '1'),
    ('search_on', 'Search'),
)
encodedFields = urllib.urlencode(formData)

# second HTTP request with form data (a data argument makes it a POST)
f = myopener.open(url, encodedFields)

# actually we'd better use BeautifulSoup once again to
# retrieve results (instead of writing out the whole HTML file).
# Besides, since the result is split into multiple pages,
# we need to send more HTTP requests.
try:
    fout = open('tmp.html', 'w')
except IOError:
    # Only a failed open is reported here; the write is skipped because
    # there is no file object to write to.
    print('Could not open output file\n')
else:
    # The with-block closes the file even if reading the response or
    # writing it out raises.
    with fout:
        fout.writelines(f.readlines())