How to handle a webpage that fails to load with BeautifulSoup
Currently, if an error occurs while fetching the page, soup never gets filled with the page content; I just get BeautifulSoup's default return value instead.
I'd like a way to check for that, so if the fetch fails I can skip a block of code, e.g.

if soup:
    do stuff

but without killing the whole program. Sorry if this is a rather basic question.
import urllib2
import BaseHTTPServer
from BeautifulSoup import BeautifulSoup

def getwebpage(address):
    try:
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = { 'User-Agent' : user_agent }
        req = urllib2.Request(address, None, headers)
        web_handle = urllib2.urlopen(req)
    except urllib2.HTTPError, e:
        error_desc = BaseHTTPServer.BaseHTTPRequestHandler.responses[e.code][0]
        appendlog('HTTP Error: ' + str(e.code) + ': ' + address)
        return
    except urllib2.URLError, e:
        appendlog('URL Error: ' + e.reason[1] + ': ' + address)
        return
    except:
        appendlog('Unknown Error: ' + address)
        return
    return web_handle

def test():
    soup = BeautifulSoup(getwebpage('http://doesnotexistblah.com/'))
    print soup
    if soup:
        do stuff

test()
2 Answers
0
soup = getwebpage('http://doesnotexistblah.com/')
if soup is not None:
    soup = BeautifulSoup(soup)
Is this what you want?
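In other words, check what getwebpage() returns before handing it to BeautifulSoup, and only build the soup when a page actually came back. A rough sketch of how that check could fit into the test() function from the question (this assumes the asker's getwebpage helper, which returns None on failure):

def test():
    page = getwebpage('http://doesnotexistblah.com/')
    if page is None:
        # the fetch failed and was already logged inside getwebpage(),
        # so just skip this page instead of ending the program
        return
    soup = BeautifulSoup(page)
    print soup
    # ... do stuff with soup here ...

test()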
3
Restructure the code so that one function is responsible for fetching the data from a URL and another is responsible for processing that data.
import urllib2, httplib
from BeautifulSoup import BeautifulSoup

def append_log(message):
    print message

def get_web_page(address):
    try:
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = { 'User-Agent' : user_agent }
        request = urllib2.Request(address, None, headers)
        response = urllib2.urlopen(request, timeout=20)
        try:
            return response.read()
        finally:
            response.close()
    except urllib2.HTTPError as e:
        error_desc = httplib.responses.get(e.code, '')
        append_log('HTTP Error: ' + str(e.code) + ': ' +
                   error_desc + ': ' + address)
    except urllib2.URLError as e:
        append_log('URL Error: ' + e.reason[1] + ': ' + address)
    except Exception as e:
        append_log('Unknown Error: ' + str(e) + ': ' + address)

def process_web_page(data):
    if data is not None:
        print BeautifulSoup(data)
    else:
        pass # do something else

data = get_web_page('http://doesnotexistblah.com/')
process_web_page(data)

data = get_web_page('http://docs.python.org/copyright.html')
process_web_page(data)
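If process_web_page eventually needs to do real work rather than just printing, the same None check keeps the error path out of the parsing code. A purely illustrative variant (the print_page_title name is made up here) that reuses get_web_page and BeautifulSoup from above to print the page's title:

def print_page_title(data):
    # Illustrative only: report the <title> of the fetched page, or note
    # that there is nothing to parse because the fetch failed.
    if data is None:
        print 'Nothing to parse (the fetch failed and was logged above)'
        return
    soup = BeautifulSoup(data)
    if soup.title is not None:
        print soup.title.string

print_page_title(get_web_page('http://docs.python.org/copyright.html'))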