使用 Python 从 Wikipedia 页面输出中检索主要段落

import re
import sys
import urllib2
from HTMLParser import HTMLParser


class MLStripper(HTMLParser):
    """Collect the text content of an HTML document, discarding the markup."""

    def __init__(self):
        # Run the base initializer so that any state it sets up exists
        # before feed() is called.  It is invoked explicitly (not through
        # super()) because HTMLParser is an old-style class on Python 2.
        HTMLParser.__init__(self)
        self.fed = []  # text fragments reported by the parser

    def handle_data(self, d):
        """Record one run of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)


def stripHTMLTags(html):
    """Return *html* as plain text with some light clean-up applied.

    ``<br>`` tags become newlines, the remaining markup is dropped via
    MLStripper, everything from the "External links" heading onward is
    cut off, and a few literal substitutions tidy the result.
    """
    html = re.sub(r'<{1}br{1}>', '\n', html)
    s = MLStripper()
    s.feed(html)
    text = s.get_data()

    # Drop the trailing link section, whichever capitalization is used.
    if "External links" in text:
        text, sep, tail = text.partition('External links')
    if "External Links" in text:
        text, sep, tail = text.partition('External Links')

    text = text.replace("See also", "\n\n See Also - \n")
    text = text.replace("*", "- ")
    text = text.replace(".", ". ")
    text = text.replace("  ", " ")
    text = text.replace("""   /
 / """, "")
    return text


# Fetch the printable rendering of the article with a browser-like
# User-agent header.
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
infile = opener.open('http://en.wikipedia.org/w/index.php?title=Albert_Einstein&printable=yes')
page = infile.read()

# Parenthesized single-argument form prints the same on Python 2 and 3.
print(stripHTMLTags(page))

5条回答

网友

1楼 · 编辑于 2024-05-19 17:03:21

我把我的答案留在这里，因为这正是 OP 所要求的。不过，正确的做法是按照下面 @ChristophD 的回答中的建议使用 python-wikitools。

我稍微修改了您问题中的代码以使用 BeautifulSoup。还有其他选择，您可能还想试试 lxml。

import urllib2
import re, sys
from HTMLParser import HTMLParser

# EDIT 1: import the package
from BeautifulSoup import BeautifulSoup

class MLStripper(HTMLParser):
    """Collect the text content of an HTML document, discarding the markup.

    Feed markup with ``feed()``; every text run the parser reports through
    ``handle_data`` is stored, and ``get_data()`` returns those runs joined
    together.
    """

    def __init__(self):
        # Run the base initializer rather than calling reset() directly, so
        # that any parser state the base class sets up in __init__ exists
        # before feed() is called.  HTMLParser.__init__ is invoked
        # explicitly (not through super()) because HTMLParser is an
        # old-style class on Python 2, where super() cannot be used.
        HTMLParser.__init__(self)
        self.fed = []  # text fragments, in the order they were reported

    def handle_data(self, d):
        """Record one run of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)

def stripHTMLTags(html):
    """Return *html* as plain text with some light clean-up applied.

    Line-break tags become newlines, the remaining markup is dropped via
    MLStripper, everything from the first "External links" /
    "External Links" heading onward is cut off, and a few literal
    substitutions tidy the result.
    """
    # Accept <br>, <br/> and <br /> in any letter case; the previous
    # pattern only matched the exact string "<br>".
    html = re.sub(r'(?i)<br\s*/?>', '\n', html)

    s = MLStripper()
    s.feed(html)
    text = s.get_data()

    # partition() returns the whole string as its first element when the
    # separator is absent, so no membership test is needed beforehand.
    for heading in ('External links', 'External Links'):
        text = text.partition(heading)[0]

    text = text.replace("See also", "\n\n See Also - \n")
    text = text.replace("*", "- ")
    text = text.replace(".", ". ")
    text = text.replace("  ", " ")
    text = text.replace("""   /
 / """, "")
    return text

# Fetch the printable rendering of the article.  A browser-like User-agent
# header is sent instead of the opener's default one.
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
infile = opener.open('http://en.wikipedia.org/w/index.php?title=Albert_Einstein&printable=yes')
try:
    page = infile.read()
finally:
    # Release the HTTP response even if reading fails.
    infile.close()

# EDIT 2: convert the page and extract text from the first <p> tag
soup = BeautifulSoup(page)
paragraphs = soup.findAll("p", limit=1)
if not paragraphs:
    # Indexing [0] on an empty result would raise a bare IndexError;
    # stop with an explicit message instead.
    sys.exit("No <p> element found in the fetched page")
para = paragraphs[0].text

# Parenthesized single-argument form prints the same on Python 2 and 3.
print(stripHTMLTags(para))

网友

2楼 · 编辑于 2024-05-19 17:03:21

我强烈建议不要对任何网站进行 HTML 抓取。

这样做很痛苦，很容易出错，而且很多网站所有者并不喜欢这种做法。

请使用 python-wikitools 与 Wikipedia API 交互（从长远来看，这是您的最佳选择）。

网友

3楼 · 编辑于 2024-05-19 17:03:21

以下API请求返回纯文本页提取： https://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles=Albert%20Einstein&explaintext

相关问题更多 >

编程相关推荐

热门问题

热门文章