查找/替换XML内容

Question

我之前用xml.etree.ElementTree成功解析了一个xml文件，找到了里面的内容，然后把这些内容写入另一个xml文件。不过，那时候我只处理了单个标签里的文本。

import os, sys, glob, xml.etree.ElementTree as ET
# Python 2 script (it uses the print statement). For each sub-folder of `path`
# it reads the text of <abstract> from archive/base_metadata_overall.xml, walks
# the <descript> elements of the working base_metadata_overall.xml, and writes
# the working tree back over that same file.
# The r"" prefix keeps every doubled backslash in the path literally.
path = r"G:\\63D RRC GIS Data\\metadata\\general\\2010_contract"
for fn in os.listdir(path):
    # Every file ending in "overall.xml" directly inside the sub-folder `fn`.
    filepaths = glob.glob(path + os.sep + fn + os.sep + "*overall.xml")
    for filepath in filepaths:
        (pa, filename) = os.path.split(filepath)
        # Grab element text from the old, archived metadata file; this text is
        # meant to be put into the current, working .xml.
        root = ET.parse(pa + os.sep + "archive" + os.sep + "base_metadata_overall.xml").getroot()

        # getiterator() without a tag visits every element, so if several
        # <abstract> elements exist the last one wins. correct_abstract is only
        # assigned when at least one is found; otherwise it stays unbound or
        # keeps the value from a previously processed file.
        # NOTE(review): getiterator() is deprecated in favour of iter() -
        # confirm against the target Python version.
        iterator = root.getiterator()
        for item in iterator:
            if item.tag == "abstract":
                correct_abstract = item.text

        root2 = ET.parse(pa + os.sep + "base_metadata_overall.xml").getroot()

        # Only elements whose tag is "descript" are yielded here.
        iterator2 = root2.getiterator("descript")
        for item in iterator2:
            # NOTE(review): `item` is always a <descript> element, so this
            # comparison with "abstract" looks like it can never be true and
            # the body below would never run - confirm the intended condition.
            if item.tag == "abstract":
                old_abstract = item.find("abstract")
                # old_abstract_text is not read again in this snippet.
                old_abstract_text = old_abstract.text
                item.remove(old_abstract)
                # NOTE(review): the new child is tagged "title" although it
                # receives the abstract text - confirm the intended tag.
                new_symbol_abstract = ET.SubElement(item, "title")
                new_symbol_abstract.text = correct_abstract                
        # Overwrites the working file that was parsed into root2 above.
        tree = ET.ElementTree(root2)
        tree.write(pa + os.sep + "base_metadata_overall.xml")
        print "created --- " + filename + " metadata"

但现在，我需要：

1) 在一个xml文件中查找并提取所有“attr”标签之间的内容，下面是一个例子：

<attr><attrlabl Sync="TRUE">OBJECTID</attrlabl><attalias Sync="TRUE">ObjectIdentifier</attalias><attrtype Sync="TRUE">OID</attrtype><attwidth Sync="TRUE">4</attwidth><atprecis Sync="TRUE">0</atprecis><attscale Sync="TRUE">0</attscale><attrdef Sync="TRUE">Internal feature number.</attrdef></attr>

2) 然后，我需要打开另一个xml文件，查找同样的“attr”标签之间的所有内容，并用上面提取的内容替换掉它。

基本上，这和我之前做的事情一样，只不过这次要忽略“attr”标签之间的子元素、属性等，把它们当作文本来处理。

谢谢大家！！

请多多包涵，这个论坛的发帖方式跟我之前用的不太一样！

这是我目前的进展：

import os, sys, glob, re, xml.etree.ElementTree as ET
from lxml import etree

# Work-in-progress Python 2 script: for each sub-folder of `path` it pulls the
# <attr>...</attr> spans out of attributes.xml with a regex and then tries to
# swap them into base_metadata_overall.xml through lxml.
# The r"" prefix keeps every doubled backslash in the path literally.
path = r"C:\\temp\\python\\xml"
for fn in os.listdir(path):
    filepaths = glob.glob(path + os.sep + fn + os.sep +  "*overall.xml")
    for filepath in filepaths:
            (pa, filename) = os.path.split(filepath)

            # The handle opened here is never closed in this snippet.
            xml = open(pa + os.sep + "attributes.xml")
            xmltext = xml.read()
            # NOTE(review): this pattern requires a space right after "<attr>",
            # unlike the pattern used for old_attrs below - confirm which one
            # is intended.
            correct_attrs = re.findall("<attr> (.*?)</attr>",xmltext,re.DOTALL)
            for item in correct_attrs:
                # Re-wrap the captured inner text so it is a complete element.
                correct_attribute = "<attr>" + item + "</attr>"

                # Re-opened and re-read for every source attr; never closed.
                xml2 = open(pa + os.sep + "base_metadata_overall.xml")
                xmltext2 = xml2.read()
                # NOTE(review): this searches xmltext (the source text) although
                # xmltext2 was just read - presumably xmltext2 was meant.
                old_attrs = re.findall("<attr>(.*?)</attr>",xmltext,re.DOTALL)
                for item2 in old_attrs:
                    # NOTE(review): built from `item`, not the loop variable
                    # `item2` - presumably a typo.
                    old_attribute = "<attr>" + item + "</attr>"               



                    old = etree.fromstring(old_attribute)
                    # NOTE(review): `new` is not assigned anywhere in this
                    # snippet, and neither `copy` nor the bare name `lxml` is
                    # imported by the import lines shown with it.
                    # xpath() returns a list, so `replacement` is a list rather
                    # than a single element when it is deep-copied below.
                    replacement = new.xpath('//attr')
                    for attr in old.xpath('//attr'):
                        # NOTE(review): `old` was parsed from a lone <attr>
                        # string, so the matched element is presumably the
                        # root and getparent() would return None - verify.
                        attr.getparent().replace(attr, copy.deepcopy(replacement))
                        print lxml.etree.tostring(old)

我已经让这个工作正常运行了，见下面的代码，甚至还搞定了如何导出到新的.xml文件。不过，如果源文件和目标文件中的“attr”数量不一样，我会遇到以下错误，有什么建议吗？

node = replacements.pop()

IndexError: pop from empty list

import os, sys, glob, re, copy, lxml, xml.etree.ElementTree as ET
from lxml import etree
# For each sub-folder of `path` that contains a "*overall.xml" file, replace
# the <attr> elements of base_metadata_overall.xml one-for-one, in document
# order, with the <attr> elements of attributes.xml and save the result as
# edited_metadata.xml in the same folder.
# The r"" prefix keeps every doubled backslash in the path literally.
path = r"C:\\temp\\python\\xml"
for fn in os.listdir(path):
    filepaths = glob.glob(path + os.sep + fn + os.sep + "*overall.xml")
    for filepath in filepaths:
        # Folder that holds the matched file; both inputs and the output
        # live next to it.
        pa = os.path.dirname(filepath)
        # Read as bytes so the parser honours the XML encoding declaration,
        # and close each handle as soon as its content is in memory.
        with open(pa + os.sep + "attributes.xml", "rb") as xmlatributes:
            xmlatributes_txt = xmlatributes.read()
        with open(pa + os.sep + "base_metadata_overall.xml", "rb") as xmltarget:
            xmltarget_txt = xmltarget.read()
        source = lxml.etree.fromstring(xmlatributes_txt)
        dest = lxml.etree.fromstring(xmltarget_txt)

        # Reversed so that pop() hands the source attrs out in document order.
        replacements = source.xpath('//attr')
        replacements.reverse()

        for attr in dest.xpath('//attr'):
            # Strict one-for-one pairing: pop() raises IndexError as soon as
            # the target holds more <attr> elements than the source supplied.
            node = replacements.pop()
            # Deep copy so the source tree keeps its own elements.
            attr.getparent().replace(attr, copy.deepcopy(node))
        # Debug aid: print(lxml.etree.tostring(dest))
        # dest is an lxml element, so serialise it with lxml's own tree class.
        tree = lxml.etree.ElementTree(dest)
        tree.write(pa + os.sep + "edited_metadata.xml")
        # Parenthesised single-argument print behaves the same on Python 2 and 3.
        print(fn + "--- sucessfully edited")

更新 2011年5月16日

我重新调整了一些东西，以修复上面提到的“IndexError: pop from empty list”错误。我意识到“attr”标签的替换不一定是1对1的。例如，有时候源.xml文件有20个“attr”，而目标.xml文件有25个“attr”。在这种情况下，1对1的替换就会出问题。

无论如何，下面的代码会先移除所有的“attr”，然后用源文件中的“attr”替换掉它们。它还会检查另一个标签“subtype”，如果存在的话，会把它们添加到“attr”后面，但在“detailed”标签内部。

再次感谢所有帮助过我的人。

import os, sys, glob, re, copy, lxml, xml.etree.ElementTree as ET
from lxml import etree
# For each sub-folder of `path` that contains a "*overall.xml" file:
#   1. remove every <attr> element from base_metadata_overall.xml,
#   2. insert copies of the <attr> elements of attributes.xml into each
#      <detailed> element, starting right after its first child,
#   3. append copies of the source <subtype> elements (if any) after them,
#   4. save the result as base_metadata_overall_v2.xml in the same folder.
# The source and target may hold different numbers of <attr> elements.
# The r"" prefix keeps every doubled backslash in the path literally.
path = r"G:\\63D RRC GIS Data\\metadata\\general\\2010_contract"
# Alternate local test folder:
#path = r"C:\\temp\python\\xml"
for fn in os.listdir(path):
    # Derived from the folder name; not used further in this excerpt.
    correct_title = fn.replace('_', ' ') + " various facilities"
    correct_fc_name = fn.replace('_', ' ')
    filepaths = glob.glob(path + os.sep + fn + os.sep + "*overall.xml")
    for filepath in filepaths:
        # Parenthesised single-argument print behaves the same on Python 2 and 3.
        print("-----" + fn + "-----")
        (pa, filename) = os.path.split(filepath)
        # Read as bytes so the parser honours the XML encoding declaration,
        # and close each handle as soon as its content is in memory.
        with open(pa + os.sep + "attributes.xml", "rb") as xmlatributes:
            xmlatributes_txt = xmlatributes.read()
        with open(pa + os.sep + "base_metadata_overall.xml", "rb") as xmltarget:
            xmltarget_txt = xmltarget.read()
        source = lxml.etree.fromstring(xmlatributes_txt)
        dest = lxml.etree.fromstring(xmltarget_txt)
        replacements = source.xpath('//attr')
        replacesubtypes = source.xpath('//subtype')

        attrtag = dest.xpath('//attr')
        for stale_attr in attrtag:
            stale_attr.getparent().remove(stale_attr)
        # Report only when something was removed; reading the loop variable
        # after an empty loop would raise NameError or reuse a stale element.
        if attrtag:
            print(attrtag[0].tag + " removed")

        detailedtag = dest.xpath('//detailed')
        for detailed in detailedtag:
            # Index 1 keeps the first child of <detailed> in front of the
            # inserted elements; advancing the index preserves source order
            # (a constant index would reverse it).
            insert_at = 1
            for realatrs in replacements:
                # An lxml element has a single parent, so insert a copy: this
                # way every <detailed> receives the attrs and the source tree
                # stays intact.
                detailed.insert(insert_at, copy.deepcopy(realatrs))
                insert_at += 1
            print("attr's replaced")
            if replacesubtypes:
                # Subtypes follow directly after the attrs just inserted.
                for realsubtypes in replacesubtypes:
                    detailed.insert(insert_at, copy.deepcopy(realsubtypes))
                    insert_at += 1
                print("subtype's replaced")

        # dest is an lxml element, so serialise it with lxml's own tree class.
        tree = lxml.etree.ElementTree(dest)
        tree.write(pa + os.sep + "base_metadata_overall_v2.xml")
        print(fn + "--- sucessfully edited")

错误处理、文件操作、查找替换、数据转换、xml解析、标签处理、内容提取、子元素管理

查找/替换XML内容

2 个回答

撰写回答