# Parse the markup, then visit everything yielded by
# soup.recursiveChildGenerator(). A node whose ``name`` attribute is not None
# is printed by name; any other node is treated as a leaf and printed as-is
# unless it consists only of whitespace.
soup = BeautifulSoup.BeautifulSoup(html)
for child in soup.recursiveChildGenerator():
    name = getattr(child, "name", None)
    if name is not None:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(name)
    elif not child.isspace():  # leaf node, don't print spaces
        print(child)
def recursiveChildren(x):
    """Print the named nodes and non-blank leaf text found beneath ``x``.

    If ``x`` exposes ``childGenerator`` it is treated as a container: each
    child with a non-None ``name`` is reported as ``[Container Node] <name>``
    and every child is then visited recursively. Otherwise ``x`` is treated
    as a terminal node and reported as ``[Terminal Node] <x>`` unless
    ``x.isspace()`` is true.

    The name of ``x`` itself is never printed; only nodes below it are
    reported.

    Args:
        x: A parse-tree node. Containers must provide ``childGenerator()``;
            any other node must provide ``isspace()``.

    Returns:
        None. Results are written to standard output.
    """
    if "childGenerator" in dir(x):
        for child in x.childGenerator():
            name = getattr(child, "name", None)
            if name is not None:
                # %-formatting inside a single-argument print(...) yields the
                # same line on Python 2 and Python 3.
                print("[Container Node] %s" % (name,))
            # Recurse for every child, named or not, so leaves are reported.
            recursiveChildren(child)
    elif not x.isspace():  # Just to avoid printing "\n" parsed from document.
        print("[Terminal Node] %s" % (x,))
if __name__ == "__main__":
    # Parse the input, then report what lies beneath each top-level node.
    # recursiveChildren() prints only the descendants of the node it is given,
    # so the names of the top-level nodes themselves are not printed.
    soup = BeautifulSoup(your_data)
    for child in soup.childGenerator():
        recursiveChildren(child)
通过 "childGenerator" in dir(x) 这一判断,我们可以确保元素是容器;终端节点(如 NavigableString)不是容器,也不包含子节点。
例如,对于如下 HTML:
<html>
<ul>
<li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
<li>Aliquam tincidunt mauris eu risus.</li>
<li>Vestibulum auctor dapibus neque.</li>
</ul>
</html>
该脚本会打印:
[Container Node] ul
[Container Node] li
[Terminal Node] Lorem ipsum dolor sit amet, consectetuer adipiscing elit.
[Container Node] li
[Terminal Node] Aliquam tincidunt mauris eu risus.
[Container Node] li
[Terminal Node] Vestibulum auctor dapibus neque.
recursiveChildGenerator() 已经实现了这种递归遍历。
对于 @msalvadores 的回答中的 HTML,其输出情况如下。
注意:由于该示例包含两个 <html> 开始标签,html 会被打印两次。
我认为您可以使用 "childGenerator" 方法,并递归地调用该方法,以深度优先遍历(DFT)的方式解析树。
通过 "childGenerator" in dir(x) 这一判断,我们可以确保元素是容器;终端节点(如 NavigableString)不是容器,也不包含子节点。
例如,对于给定的 HTML,该脚本会打印出各容器节点和终端节点。
相关问题 更多 >
编程相关推荐