from bs4 import BeautifulSoup
def tag_visible(element) -> bool:
    """Return True when *element* should be kept as visible text.

    The check is a plain equality test against the literal string
    "Display Block"; any other value (including differently cased or
    padded variants) is rejected.

    NOTE(review): the filter is hard-coded to one specific text value,
    presumably matching the sample input file -- confirm before reusing
    this on other documents.
    """
    return bool(element == "Display Block")
def text_from_html(body):
    """Extract the text nodes of an HTML document that pass tag_visible.

    Args:
        body: HTML markup to parse.

    Returns:
        A single string made of every accepted text node, each stripped
        of surrounding whitespace and separated by one space.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # string=True selects every text node; it is the current spelling of
    # the older findAll(text=True) call.
    texts = soup.find_all(string=True)
    return " ".join(t.strip() for t in texts if tag_visible(t))
# Read the sample HTML file, then print the text extracted from it.
with open("testhtml.html", "r") as open_html:
    html = open_html.read()
# The file is already closed here; only the in-memory markup is needed.
print(text_from_html(html))
您可以尝试以下脚本,它应该只打印 HTML 中可见的网页文本。
代码:
输入HTML文件:
输出:
相关问题 更多 >
编程相关推荐