Python错误：lxml + tidylib / 我想解决乱码（garbled characters）问题

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Fetch a web page, tidy its markup and print the text of its <title>."""
import urllib2

import cchardet
import lxml.html
from tidylib import tidy_document


class ParseHTML(object):
    """Decode, tidy and parse an HTML byte string.

    Attributes set by the constructor:
        charset: encoding name reported by cchardet, or 'utf-8' when
            cchardet reports none.
        html: the input decoded to unicode using ``charset``.
        document, errors: the pair returned by ``tidy_document``.
        dom: the lxml.html tree built from ``document``.
    """

    def __init__(self, html):
        """Build the DOM from the raw (undecoded) page bytes in ``html``.

        Raises:
            UnicodeDecodeError: if the bytes are not valid in ``charset``.
        """
        # cchardet reports None when it cannot guess; decode(None) would
        # raise TypeError, so fall back to UTF-8 in that case.
        self.charset = cchardet.detect(html)['encoding'] or 'utf-8'
        self.html = html.decode(self.charset)  # lineA
        self.document, self.errors = tidy_document(self.html)

        # Hand lxml UTF-8 bytes together with an explicit parser encoding
        # instead of a unicode string.  NOTE(review): the garbled title is
        # presumably caused by the parser honouring a <meta charset> left in
        # the markup that no longer matches the decoded text -- confirm
        # against the target page.
        markup = self.document
        if not isinstance(markup, bytes):
            markup = markup.encode('utf-8')
        parser = lxml.html.HTMLParser(encoding='utf-8')
        self.dom = lxml.html.fromstring(markup, parser=parser)

    def getTitle(self):
        """Return the stripped text of the first <title> element.

        ``self.title`` keeps the unstripped text (None when the page has no
        <title> or the element is empty); the return value is then an empty
        string rather than an exception.
        """
        nodes = self.dom.xpath('//title')
        self.title = nodes[0].text if nodes else None
        return (self.title or u'').strip()


url = r'http://www.asahi.com/articles/ASG2B5T0ZG2BUHBI131.html?iref=comtop_6_01'
response = urllib2.urlopen(url)
try:
    html = response.read()
finally:
    # Release the connection even if reading fails.
    response.close()
parse = ParseHTML(html)
title = parse.getTitle()
print(title)

1条回答

网友

1楼 · 发布于 2024-05-15 04:44:55

仅使用lxml

>>> from lxml import etree
>>> with open('out.html', 'w') as f:
...     root = etree.parse(url, etree.HTMLParser())
...     title = root.xpath('//title/text()')[0]
...     f.write(title.encode("utf-8"))

在浏览器中打开 out.html，显示如下：

中国・台湾、窓口役の担当閣僚が会談　４９年の分断後初：朝日新聞デジタル

相关问题更多 >

编程相关推荐

热门问题

热门文章