How do I avoid non-HTML content with Python urllib?
I'm using urllib (note: not urllib2) to get the title of a page from a user-supplied URL. Unfortunately, sometimes the URL isn't an HTML page at all, but some huge file or a long-running process on the remote site.
I've checked the Python documentation, but urllib is fairly limited, and looking at the source it seems I could modify it; however, I can't make those changes on the server. The docs mention the info() method, but give no concrete example of how to use it.
I'm using FancyURLopener, which I gather isn't available in urllib2, and I don't know whether urllib2 can solve this problem.
- Is there a way to set a socket timeout? (The only approach I've found so far is sketched right after this question.)
- More importantly, how do I limit the request to HTML/XHTML content only and ignore everything else entirely? In other words, I want to make sure the full content isn't downloaded.
I'm still going through urllib's source and looking at urllib2, but I'm not an expert on these tools.
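For the timeout part, the only thing I've come across so far is the process-wide default below, which urllib seems to honour because it ultimately opens plain sockets; I'd appreciate confirmation that this is the right approach (sketch assumes Python 2, and the 10-second value is an arbitrary choice of mine):
import socket
import urllib

# Process-wide default timeout; it applies to every socket opened after this
# call, including the ones urllib creates. A stalled connect/read then raises
# socket.timeout (an IOError subclass in Python 2.6+).
socket.setdefaulttimeout(10)

try:
    d = urllib.urlopen('http://www.google.com/')
    print d.info().gettype()   # e.g. 'text/html'
except IOError, e:
    print 'request failed or timed out:', e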
1 Answer
The documentation here mentions that the info() method returns the meta-information associated with the URL. You can use it to grab the page's headers and check the content type (for example, text/html); if it isn't a type you want, you can abandon the request.
>>> import urllib
>>> d = urllib.urlopen('http://www.google.com/')
>>> try:
...     if d.info()['content-type'].startswith('text/html'):
...         print 'its html'
...     else:
...         print 'its not html'
... except KeyError:
...     print 'its not html'
...
its html
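If the type does check out and you only need the title, you don't necessarily have to read the whole document either: the object urlopen() returns is file-like, so you can read a bounded prefix and search only that. A rough sketch (the 64 KB cap and the naive regex are my own choices, not anything urllib prescribes):
import re
import urllib

d = urllib.urlopen('http://www.google.com/')
if d.info().getheader('content-type', '').startswith('text/html'):
    head = d.read(65536)   # read at most 64 KB of the body, not the whole thing
    m = re.search(r'<title[^>]*>(.*?)</title>', head, re.I | re.S)
    if m:
        print m.group(1).strip()
    else:
        print 'no <title> in the first 64 KB'
else:
    print 'not html, skipping'
d.close()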
I also whipped something up quickly that lets you specify a HEAD request in urllib. :)
import sys
import socket
import urllib
from urllib import unwrap, toBytes, quote, splittype, splithost, splituser, unquote, addinfourl, proxy_bypass

class MyURLOpener(urllib.FancyURLopener):
    # Copy of urllib.URLopener.open_http with an extra optional `method`
    # argument that is passed through to putrequest().
    def open_http(self, url, data=None, method=None):
        """Use HTTP protocol."""
        import httplib
        user_passwd = None
        proxy_passwd = None
        if isinstance(url, str):
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost
            #print "proxy via http:", host, selector
        if not host: raise IOError, ('http error', 'no host given')
        if proxy_passwd:
            import base64
            proxy_auth = base64.b64encode(proxy_passwd).strip()
        else:
            proxy_auth = None
        if user_passwd:
            import base64
            auth = base64.b64encode(user_passwd).strip()
        else:
            auth = None
        h = httplib.HTTP(host)
        if method is not None:
            h.putrequest(method, selector)
        else:
            h.putrequest('GET', selector)
        if data is not None:
            #h.putrequest('POST', selector)
            h.putheader('Content-Type', 'application/x-www-form-urlencoded')
            h.putheader('Content-Length', '%d' % len(data))
        if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
        if auth: h.putheader('Authorization', 'Basic %s' % auth)
        if realhost: h.putheader('Host', realhost)
        for args in self.addheaders: h.putheader(*args)
        h.endheaders(data)
        errcode, errmsg, headers = h.getreply()
        fp = h.getfile()
        if errcode == -1:
            if fp: fp.close()
            # something went wrong with the HTTP status line
            raise IOError, ('http protocol error', 0,
                            'got a bad status line', None)
        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if (200 <= errcode < 300):
            return addinfourl(fp, headers, "http:" + url, errcode)
        else:
            if data is None:
                return self.http_error(url, fp, errcode, errmsg, headers)
            else:
                return self.http_error(url, fp, errcode, errmsg, headers, data)

    # Copy of urllib.URLopener.open that forwards the extra `method` argument
    # to the open_* handler.
    def open(self, fullurl, data=None, method=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(toBytes(fullurl))
        # percent encode url, fixing lame server errors for e.g, like space
        # within url paths.
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            # note: data and method are always passed on here, so handlers
            # other than open_http would need the same signature
            return getattr(self, name)(url, data, method)
        except socket.error, msg:
            raise IOError, ('socket error', msg), sys.exc_info()[2]
opener = MyURLOpener()
# NOTE: including any data no longer implicitly makes the method POST,
# so you must now specify the method to POST if you include data
# NOTE: this overrides only open_http, and not open_https, but you can
# use a similar technique, and override open_https as well
d = opener.open('http://www.google.com/', method='HEAD')
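To tie this back to the original problem, here is roughly how I would combine the HEAD request with the content-type check. The is_probably_html helper is just my own illustration, reusing the opener defined above; and since some servers handle HEAD poorly, you may want to fall back to a bounded GET (like the read-a-prefix trick earlier) when it fails:
def is_probably_html(url):
    # Ask for the headers only and inspect the advertised content type
    # without fetching the body.
    try:
        d = opener.open(url, method='HEAD')
    except IOError:
        return False
    content_type = d.info().getheader('content-type', '')
    d.close()
    return (content_type.startswith('text/html') or
            content_type.startswith('application/xhtml+xml'))

if is_probably_html('http://www.google.com/'):
    print 'looks like (X)HTML, safe to fetch it and parse the title'
else:
    print 'not (X)HTML, skipping'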