Python 解析大 XML 时 lxml 的内存使用情况
我是一名刚接触Python的新手。我正在尝试在我的Python模块中使用lxml来解析一个很大的XML文件。尽管我在每次循环结束时都清理了元素,但我的内存使用量还是飙升,导致应用程序崩溃。我确信我在这里遗漏了一些东西。请帮我找出问题所在。
以下是我使用的主要函数 -
from lxml import etree
def parseXml(context, attribList):
    """Yield one attribute dict (row) per element produced by *context*.

    context: an iterable of (event, element) pairs, e.g. etree.iterparse(...).
    attribList: attribute names to extract from each element.
    """
    for _, element in context:
        fieldMap = {}
        rowList = []
        readAttribs(element, fieldMap, attribList)
        # BUG FIX: the original call omitted rowList, which readAllChildren's
        # signature requires (TypeError) -- and without it no rows were
        # ever collected, so the generator yielded nothing.
        readAllChildren(element, fieldMap, attribList, rowList)
        # BUG FIX: clear the element *before* yielding. The original cleared
        # after the yield loop, so cleanup only ran when the generator was
        # resumed for the next element -- memory was released one step late.
        element.clear()
        for row in rowList:
            yield row
def readAttribs(element, fieldMap, attribList):
    """Copy each attribute named in attribList from element into fieldMap.

    Missing attributes are stored as '' so every row has the same keys.
    """
    # BUG FIX: the loop variable was misspelled 'atrrib', so the lookup of
    # 'attrib' in the body raised NameError on the first iteration.
    for attrib in attribList:
        fieldMap[attrib] = element.get(attrib, '')
def readAllChildren(element, fieldMap, attribList, rowList):
    """Depth-first walk of element's children, appending one dict per child.

    Note: fieldMap is shared and mutated across the whole walk, so each
    appended copy inherits values set by previously visited elements.
    """
    for childElem in element:
        # BUG FIX: was 'childEleme' (NameError).
        readAttribs(childElem, fieldMap, attribList)
        if len(childElem) > 0:
            # BUG FIX: the recursive call omitted the rowList argument,
            # which this function's own signature requires.
            readAllChildren(childElem, fieldMap, attribList, rowList)
        # BUG FIX: was 'rowlist' -- Python names are case-sensitive,
        # so this raised NameError.
        rowList.append(fieldMap.copy())
        childElem.clear()
def main(fullFilePath='input.xml'):
    """Parse fullFilePath with iterparse and print every extracted row.

    BUG FIXES vs. the original: 'fullFilePath' was an undefined name (now a
    parameter with a default, keeping the zero-arg call working), and the
    'for' line was missing its trailing colon (SyntaxError).
    """
    attribList = ['name', 'age', 'id']
    context = etree.iterparse(fullFilePath, events=("start",))
    for row in parseXml(context, attribList):
        # Single-argument print() is valid in both Python 2 and Python 3.
        print(row)
谢谢!!
下面是示例XML和嵌套字典 -
<root xmlns='NS'>
<Employee Name="Mr.ZZ" Age="30">
<Experience TotalYears="10" StartDate="2000-01-01" EndDate="2010-12-12">
<Employment id = "1" EndTime="ABC" StartDate="2000-01-01" EndDate="2002-12-12">
<Project Name="ABC_1" Team="4">
</Project>
</Employment>
<Employment id = "2" EndTime="XYZ" StartDate="2003-01-01" EndDate="2010-12-12">
<PromotionStatus>Manager</PromotionStatus>
<Project Name="XYZ_1" Team="7">
<Award>Star Team Member</Award>
</Project>
</Employment>
</Experience>
</Employee>
</root>
# Marker keys used throughout the nested definition below.
ELEMENT_NAME = 'element_name'
ELEMENTS = 'elements'
ATTRIBUTES = 'attributes'
TEXT = 'text'

# Declarative description of the sample XML above: each node names an
# element, lists its child definitions, the attributes to capture, and any
# text fields to capture.
xmlDef = {
    'namespace': 'NS',
    'content': {
        ELEMENT_NAME: 'Employee',
        ELEMENTS: [{
            ELEMENT_NAME: 'Experience',
            ELEMENTS: [{
                ELEMENT_NAME: 'Employment',
                ELEMENTS: [
                    {
                        ELEMENT_NAME: 'PromotionStatus',
                        ELEMENTS: [],
                        ATTRIBUTES: [],
                        TEXT: ['PromotionStatus'],
                    },
                    {
                        ELEMENT_NAME: 'Project',
                        ELEMENTS: [{
                            ELEMENT_NAME: 'Award',
                            # BUG FIX: was {} -- every other ELEMENTS value
                            # is a list; an empty dict would break any code
                            # that iterates child definitions uniformly.
                            ELEMENTS: [],
                            ATTRIBUTES: [],
                            TEXT: ['Award'],
                        }],
                        ATTRIBUTES: ['Name', 'Team'],
                        TEXT: [],
                    },
                ],
                # NOTE(review): the sample XML shows Employment carrying
                # id/EndTime/StartDate/EndDate; this list looks copied from
                # the Experience level -- confirm before relying on it.
                ATTRIBUTES: ['TotalYears', 'StartDate', 'EndDate'],
                TEXT: [],
            }],
            ATTRIBUTES: ['TotalYears', 'StartDate', 'EndDate'],
            TEXT: [],
        }],
        ATTRIBUTES: ['Name', 'Age'],
        TEXT: [],
    },
}
1 个回答
欢迎来到Python和Stack Overflow!
看起来你在研究 lxml
(特别是 etree.iterparse(..))
时已经得到了一些不错的建议,但我觉得你的实现方式有点偏离了方向。iterparse(..)
的主要目的是不去收集和存储数据,而是实时处理读取到的标签。你的readAllChildren(..)
函数把所有内容都保存到了rowList
里,这样它就会不断增长,覆盖整个文档树。我做了一些修改来展示发生了什么:
from lxml import etree
# Instrumented copy of the asker's generator (Python 2 print syntax), with
# the missing rowList argument supplied so it actually runs; the prints
# produce the trace shown further below.
def parseXml(context,attribList):
    # context is an iterable of (event, element) pairs from etree.iterparse.
    for event, element in context:
        print "%s element %s:" % (event, element)
        fieldMap = {}
        rowList = []
        readAttribs(element, fieldMap, attribList)
        # Collects a dict for *every* descendant into rowList -- the whole
        # subtree is walked before anything is yielded.
        readAllChildren(element, fieldMap, attribList, rowList)
        for row in rowList:
            yield row
        # NOTE: this clear() only runs on the next loop pass, after the
        # yield above resumes -- the delayed cleanup discussed in the text.
        element.clear()
def readAttribs(element, fieldMap, attribList):
    # Copy each requested attribute into fieldMap ('' when absent), then
    # print the map so the trace shows what was captured at each step.
    for attrib in attribList:
        fieldMap[attrib] = element.get(attrib,'')
    print "fieldMap:", fieldMap
def readAllChildren(element, fieldMap, attribList, rowList):
    # Depth-first walk that appends one dict per child into the shared
    # rowList -- this unbounded accumulation is what makes memory grow
    # with the size of the subtree.
    for childElem in element:
        print "Found child:", childElem
        readAttribs(childElem, fieldMap, attribList)
        if len(childElem) > 0:
            # Recurse into grandchildren; fieldMap is shared, so values
            # set deeper in the tree leak into later copies.
            readAllChildren(childElem, fieldMap, attribList, rowList)
        rowList.append(fieldMap.copy())
        print "len(rowList) =", len(rowList)
        childElem.clear()
def process_xml_original(xml_file):
    # Drive the asker's original pipeline over xml_file and print each row,
    # using 'start' events exactly as the question's main() did.
    attribList=['name','age','id']
    context=etree.iterparse(xml_file, events=("start",))
    for row in parseXml(context,attribList):
        print "Row:", row
用一些虚拟数据运行:
>>> from cStringIO import StringIO
>>> test_xml = """\
... <family>
... <person name="somebody" id="5" />
... <person age="45" />
... <person name="Grandma" age="62">
... <child age="35" id="10" name="Mom">
... <grandchild age="7 and 3/4" />
... <grandchild id="12345" />
... </child>
... </person>
... <something-completely-different />
... </family>
... """
>>> process_xml_original(StringIO(test_xml))
start element: <Element family at 0x105ca58>
fieldMap: {'age': '', 'name': '', 'id': ''}
Found child: <Element person at 0x105ca80>
fieldMap: {'age': '', 'name': 'somebody', 'id': '5'}
len(rowList) = 1
Found child: <Element person at 0x105c468>
fieldMap: {'age': '45', 'name': '', 'id': ''}
len(rowList) = 2
Found child: <Element person at 0x105c7b0>
fieldMap: {'age': '62', 'name': 'Grandma', 'id': ''}
Found child: <Element child at 0x106e468>
fieldMap: {'age': '35', 'name': 'Mom', 'id': '10'}
Found child: <Element grandchild at 0x106e148>
fieldMap: {'age': '7 and 3/4', 'name': '', 'id': ''}
len(rowList) = 3
Found child: <Element grandchild at 0x106e490>
fieldMap: {'age': '', 'name': '', 'id': '12345'}
len(rowList) = 4
len(rowList) = 5
len(rowList) = 6
Found child: <Element something-completely-different at 0x106e4b8>
fieldMap: {'age': '', 'name': '', 'id': ''}
len(rowList) = 7
Row: {'age': '', 'name': 'somebody', 'id': '5'}
Row: {'age': '45', 'name': '', 'id': ''}
Row: {'age': '7 and 3/4', 'name': '', 'id': ''}
Row: {'age': '', 'name': '', 'id': '12345'}
Row: {'age': '', 'name': '', 'id': '12345'}
Row: {'age': '', 'name': '', 'id': '12345'}
Row: {'age': '', 'name': '', 'id': ''}
start element: <Element person at 0x105ca80>
fieldMap: {'age': '', 'name': '', 'id': ''}
start element: <Element person at 0x105c468>
fieldMap: {'age': '', 'name': '', 'id': ''}
start element: <Element person at 0x105c7b0>
fieldMap: {'age': '', 'name': '', 'id': ''}
start element: <Element child at 0x106e468>
fieldMap: {'age': '', 'name': '', 'id': ''}
start element: <Element grandchild at 0x106e148>
fieldMap: {'age': '', 'name': '', 'id': ''}
start element: <Element grandchild at 0x106e490>
fieldMap: {'age': '', 'name': '', 'id': ''}
start element: <Element something-completely-different at 0x106e4b8>
fieldMap: {'age': '', 'name': '', 'id': ''}
虽然有点难以阅读,但你可以看到它从根标签开始,逐层向下遍历整个树,为文档中的每个元素构建rowList
。你还会注意到,它甚至没有停下来,因为element.clear()
的调用是在parseXml(..)
中的yield
语句之后,所以它要等到第二次循环(即树中的下一个元素)才会执行。
增量处理真棒
一个简单的解决办法是让iterparse(..)
发挥作用:进行迭代解析!下面的代码会提取相同的信息,并以增量的方式处理:
def do_something_with_data(data):
    """This just prints it out. Yours will probably be more interesting."""
    # Placeholder consumer: swap in a DB insert, aggregation, etc.
    print "Got data: ", data
def process_xml_iterative(xml_file):
    # by using the default 'end' event, you start at the _bottom_ of the tree
    ATTRS = ('name', 'age', 'id')
    for event, element in etree.iterparse(xml_file):
        print "%s element: %s" % (event, element)
        data = {}
        for attr in ATTRS:
            data[attr] = element.get(attr, u"")
        # Hand the row off immediately -- nothing accumulates between
        # iterations, so memory use stays flat.
        do_something_with_data(data)
        element.clear()
        del element # for extra insurance
在相同的虚拟XML上运行:
>>> print test_xml
<family>
<person name="somebody" id="5" />
<person age="45" />
<person name="Grandma" age="62">
<child age="35" id="10" name="Mom">
<grandchild age="7 and 3/4" />
<grandchild id="12345" />
</child>
</person>
<something-completely-different />
</family>
>>> process_xml_iterative(StringIO(test_xml))
end element: <Element person at 0x105cc10>
Got data: {'age': u'', 'name': 'somebody', 'id': '5'}
end element: <Element person at 0x106e468>
Got data: {'age': '45', 'name': u'', 'id': u''}
end element: <Element grandchild at 0x106e148>
Got data: {'age': '7 and 3/4', 'name': u'', 'id': u''}
end element: <Element grandchild at 0x106e490>
Got data: {'age': u'', 'name': u'', 'id': '12345'}
end element: <Element child at 0x106e508>
Got data: {'age': '35', 'name': 'Mom', 'id': '10'}
end element: <Element person at 0x106e530>
Got data: {'age': '62', 'name': 'Grandma', 'id': u''}
end element: <Element something-completely-different at 0x106e558>
Got data: {'age': u'', 'name': u'', 'id': u''}
end element: <Element family at 0x105c6e8>
Got data: {'age': u'', 'name': u'', 'id': u''}
这样应该会大大提高你脚本的速度和内存性能。此外,通过挂钩'end'
事件,你可以在处理过程中随时清除和删除元素,而不必等到所有子元素都处理完。
根据你的数据集,可能只处理某些类型的元素会更好。比如,根元素可能没有太大意义,其他嵌套元素也可能会让你的数据集充满很多{'age': u'', 'id': u'', 'name': u''}
这样的内容。
或者,使用SAX
顺便提一下,当我看到“XML”和“低内存”时,我的脑海中总是会想到SAX,这也是解决这个问题的另一种方法。使用内置的xml.sax
模块:
import xml.sax
class AttributeGrabber(xml.sax.handler.ContentHandler):
    """SAX Handler which will store selected attribute values."""
    def __init__(self, target_attrs=()):
        # Attribute names to pull from every element encountered.
        self.target_attrs = target_attrs
    def startElement(self, name, attrs):
        # Called by the SAX parser once per opening tag; attrs offers a
        # dict-like .get() for the tag's attributes.
        print "Found element: ", name
        data = {}
        for target_attr in self.target_attrs:
            data[target_attr] = attrs.get(target_attr, u"")
        # (no xml trees or elements created at all)
        do_something_with_data(data)
def process_xml_sax(xml_file):
    # Stream xml_file through the handler; SAX never builds a tree, so
    # memory use is independent of document size.
    grabber = AttributeGrabber(target_attrs=('name', 'age', 'id'))
    xml.sax.parse(xml_file, grabber)
你需要根据你的情况评估这两种选择哪个更适合(如果你经常需要这样做,可能还要跑几次基准测试)。
记得跟进一下结果如何哦!
根据后续评论的编辑
实现上述任一解决方案可能需要对你代码的整体结构进行一些调整,但你现有的代码应该还是可以做到的。例如,如果你想批量处理“行”,你可以这样:
def process_xml_batch(xml_file, batch_size=10):
    """Incrementally parse xml_file, handing rows downstream in batches.

    Each row is a dict of the three target attributes; any partial final
    batch is flushed once the parse loop finishes.
    """
    wanted = ('name', 'age', 'id')
    pending = []
    for _event, node in etree.iterparse(xml_file):
        pending.append({key: node.get(key, u"") for key in wanted})
        node.clear()
        del node  # drop our reference as soon as we are done with it
        if len(pending) == batch_size:
            do_something_with_batch(pending)
            # (to make this a generator instead: yield pending)
            pending = []
    if pending:
        # flush the leftover rows from the final, partial batch
        do_something_with_batch(pending)  # Or, yield pending