如何使用python minidom 从XML中提取数据

1 投票
3 回答
3578 浏览
提问于 2025-04-17 01:45

给定这个XML文件,我想从中提取数据。不过,我在提取从 <LandmarkPointListXml> 开始的数据时遇到了困难。

这个XML文件:

  <?xml version="1.0" encoding="utf-8"?>
  <Map xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
  <MapName>er</MapName>
  <MapURL>er.gif</MapURL>
  <Name>er</Name>
  <URL>er.gif</URL>
  <LandmarkPointListXml>
    <anyType xsi:type="LandmarkPointProperty">
      <LandmarkPointX>400</LandmarkPointX>
      <LandmarkPointY>292</LandmarkPointY>
      <LandmarkDesc>my room door</LandmarkDesc>
    </anyType>
    <anyType xsi:type="LandmarkPointProperty">
      <LandmarkPointX>399</LandmarkPointX>
      <LandmarkPointY>219</LandmarkPointY>
      <LandmarkDesc>bro room door</LandmarkDesc>
    </anyType>
  </LandmarkPointListXml>
  <RegionPointListXml />
</Map>

Python程序:

    def GetMapData(self):
        """Flatten the children of the document's first child into one string.

        For a child named 'LandmarkPointListXml' the result of
        self.loopLandmark() is appended, wrapped in '|'.  For every other
        child the nodeValue of its first child is appended, followed by ','.
        """
        result = ""
        haha = self.XMLdoc.firstChild #root node
        for child in haha.childNodes:
            if (cmp(child.nodeName,'LandmarkPointListXml')==0):
                # NOTE(review): child.childNodes is a NodeList, but
                # loopLandmark() calls getElementsByTagName() on its argument,
                # which is a method of element/document nodes.  Passing
                # `child` itself is presumably what was intended.
                result = result + '|' + self.loopLandmark(child.childNodes) + '|'
            else:
                # NOTE(review): this assumes every other child has a first
                # child; an empty element such as <RegionPointListXml /> has
                # none, so firstChild would be None here.
                result = result + child.firstChild.nodeValue + ','
        return result

    def loopLandmark(self, landmarks):
        """Build a comma-separated string from the 'anyType' entries of landmarks.

        NOTE(review): as written this cannot run to completion; see the inline
        notes below.
        """
        result=""
        # getElementsByTagName() returns a list of matching elements.
        haha=landmarks.getElementsByTagName('anyType')
        # NOTE(review): `haha` is that list, not a node, so it has no
        # childNodes / firstChild attributes; iterating `haha` directly and
        # then each entry's children is presumably what was intended.
        for child in haha.childNodes:
            if (cmp(haha.firstChild.nodeName,'LandmarkPointX') == 0):
                result=result+child.firstChild.nodeValue+','
                # NOTE(review): `ChildNode` is read here before it is ever
                # assigned, and `child` itself is never advanced, so the three
                # appends in this branch all read the same node.
                ChildNode = ChildNode.nextSibling
                result=result+child.firstChild.nodeValue+','
                ChildNode = ChildNode.nextSibling
                result=result+child.firstChild.nodeValue
        return result

在程序到达 <LandmarkPointListXml> 之前,我能够得到结果“er,er.gif,er,er.gif,”,但之后就无法继续提取了。

3 个回答

0

我之前的回答还不够完整。这里是我认为应该没问题的答案。

import sys
import socket
import os
from xml.dom.minidom import Document, parse, parseString, Node

class mapDataClass:
    """Load a map XML document with minidom and flatten it into one string.

    The flattened form produced by GetMapData() is::

        <simple values joined by ','>|<landmarks>|<region marker>

    where each landmark is "x,y,description" and landmarks are separated
    by ';'.
    """

    def __init__(self):
        """Start with an empty document that only contains a <Map> root."""
        self.XMLdoc = Document()
        self.MakeRootNode()

    def MakeRootNode(self):
        """Create the <Map> root element and attach it to the document."""
        self.RootNode = self.XMLdoc.createElement('Map')
        self.XMLdoc.appendChild(self.RootNode)

    def GetXML_Doc(self):
        """Return the minidom document currently held by this object."""
        return self.XMLdoc

    def LoadXMLFile(self, AbsFileName):
        """Parse AbsFileName and make it the current document.

        Returns True when the file was parsed, False when it could not be
        opened.  The previous document is only released after the new one
        has been parsed, so a failed load leaves this object usable.
        """
        try:
            new_doc = parse(AbsFileName)
        except IOError:
            print('File ' + AbsFileName + ' not found')
            return False

        self.XMLdoc.unlink()
        self.XMLdoc = new_doc
        root = self.XMLdoc.documentElement
        # Only adopt the root node when it is the expected <Map> element.
        if root is not None and root.nodeName == 'Map':
            self.RootNode = root
        return True

    def GetMapData(self):
        """Return the flattened string described in the class docstring.

        An empty <LandmarkPointListXml/> contributes 'EMPTY|' and an empty
        <RegionPointListXml/> contributes 'EMPTY'.  Whitespace text nodes
        between elements are skipped.
        """
        simple_values = []
        landmark_part = ""
        region_part = ""
        for child in self.XMLdoc.documentElement.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.nodeName == 'LandmarkPointListXml':
                if child.firstChild is not None:
                    landmark_part = self.loopLandmark(child)
                else:
                    landmark_part = 'EMPTY|'
            elif child.nodeName == 'RegionPointListXml':
                if child.firstChild is None:
                    region_part = 'EMPTY'
            else:
                simple_values.append(self._node_text(child))
        # Joining keeps the separators correct regardless of element order.
        return ','.join(simple_values) + "|" + landmark_part + region_part

    def loopLandmark(self, landmarks):
        """Return all <anyType> entries as 'x,y,desc;x,y,desc|'.

        Returns an empty string when landmarks contains no <anyType>.
        """
        entries = [self.loopAnyType(item)
                   for item in landmarks.getElementsByTagName('anyType')]
        if not entries:
            return ""
        return ';'.join(entries) + '|'

    def loopAnyType(self, anyType):
        """Return 'x,y,desc' for one <anyType>; missing fields become ''."""
        values = []
        for tag in ('LandmarkPointX', 'LandmarkPointY', 'LandmarkDesc'):
            matches = anyType.getElementsByTagName(tag)
            values.append(self._node_text(matches[0]) if matches else '')
        return ','.join(values)

    @staticmethod
    def _node_text(node):
        """Return the concatenated direct text of node ('' when empty)."""
        return ''.join(part.data for part in node.childNodes
                       if part.nodeType in (Node.TEXT_NODE,
                                            Node.CDATA_SECTION_NODE))

# Demo: load a map file and print its flattened representation.
data = mapDataClass()
# Raw string so the backslash in the Windows-style path is taken literally.
success = data.LoadXMLFile(r"upload\homeTest.m")
if success:
    print("file loaded")
    print(data.GetMapData())
else:
    print("no such file found")
0

我成功地从发布的XML文件中提取出了数据。但是我觉得我的方法可以更简单一些。我用了很多循环来获取每一条数据。

import sys
import socket
import os
from xml.dom.minidom import Document, parse, parseString

class mapDataClass:
    """Load a map XML document with minidom and flatten it into one string.

    GetMapData() returns
    "<simple values joined by ','>|<landmarks>EMPTY", where each landmark
    is "x,y,description", landmarks are separated by ';' and the landmark
    list is terminated by '|'.
    """

    def __init__(self):
        """Start with an empty document that only contains a <Map> root."""
        self.XMLdoc = Document()
        self.MakeRootNode()

    def MakeRootNode(self):
        """Create the <Map> root element and attach it to the document."""
        self.RootNode = self.XMLdoc.createElement('Map')
        self.XMLdoc.appendChild(self.RootNode)

    def GetXML_Doc(self):
        """Return the minidom document currently held by this object."""
        return self.XMLdoc

    def LoadXMLFile(self, AbsFileName):
        """Parse AbsFileName and make it the current document.

        Returns True when the file was parsed, False when it could not be
        opened.  The old document is released only after a successful
        parse so that a failed load does not leave an emptied document.
        """
        try:
            new_doc = parse(AbsFileName)
        except IOError:
            print('File ' + AbsFileName + ' not found')
            return False

        self.XMLdoc.unlink()
        self.XMLdoc = new_doc
        root = self.XMLdoc.documentElement
        # Determine whether the root node <Map> exists.
        if root is not None and root.nodeName == 'Map':
            self.RootNode = root
        return True

    def GetMapData(self):
        """Return the flattened string described in the class docstring.

        Only element children of the root are considered; the whitespace
        text nodes minidom keeps between elements are skipped.  The region
        list is always reported as 'EMPTY'.
        """
        simple_values = []
        landmark_part = ""
        for child in self.XMLdoc.documentElement.childNodes:
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.nodeName == 'LandmarkPointListXml':
                landmark_part = self.loopLandmark(child)
            elif child.nodeName == 'RegionPointListXml':
                print('Empty')
            else:
                simple_values.append(self._node_text(child))
        return ','.join(simple_values) + "|" + landmark_part + "EMPTY"

    def loopLandmark(self, landmarks):
        """Return all <anyType> entries as 'x,y,desc;x,y,desc|'.

        Returns an empty string when landmarks contains no <anyType>.
        """
        entries = [self.loopAnyType(item)
                   for item in landmarks.getElementsByTagName('anyType')]
        if not entries:
            return ""
        return ';'.join(entries) + '|'

    def loopAnyType(self, anyType):
        """Return 'x,y,desc' for one <anyType>; missing fields become ''."""
        values = []
        for tag in ('LandmarkPointX', 'LandmarkPointY', 'LandmarkDesc'):
            matches = anyType.getElementsByTagName(tag)
            values.append(self._node_text(matches[0]) if matches else '')
        return ','.join(values)

    @staticmethod
    def _node_text(node):
        """Return the concatenated direct text of node ('' when empty)."""
        return ''.join(part.data for part in node.childNodes
                       if part.nodeType in (node.TEXT_NODE,
                                            node.CDATA_SECTION_NODE))

# Demo: load a map file, report whether loading worked, then print the data.
profile = mapDataClass()
# Raw string so the backslash in the Windows-style path is taken literally.
boolean = profile.LoadXMLFile(r'upload\er.m')
print(boolean)
# Only flatten the document when it was actually loaded.
if boolean:
    result = profile.GetMapData()
    print(result)
2

这段代码很脆弱。它对XML输入有很强的假设,如果XML被合理地修改了(比如某个标签没有紧跟在另一个标签后面),代码就会出错。

我建议使用更成熟的XML解析库,比如标准库中的ElementTree(http://docs.python.org/library/xml.etree.elementtree.html)或者第三方库lxml(http://lxml.de),这些库也可以帮助你验证你的XML输入。

我下面写的代码使用了ElementTree,并且可以处理你的XML输入(我把这些方法改成了普通函数,所以去掉了'self'参数)。它还可以容忍(忽略)XML元素中的空值。

import xml.etree.ElementTree as ET

def GetMapData( xmlfile ):
    """Flatten the map XML in xmlfile into a single string.

    xmlfile is a file name or file object accepted by ET.parse().  Each
    child of the root that has text contributes "text,"; the
    LandmarkPointListXml child contributes "|" + loopLandmark(child) + "|".
    Children without text are skipped.

    Returns an empty string (after printing a message) when the file
    cannot be opened.
    """
    try:
        tree = ET.parse( xmlfile )
    except IOError as e:
        print( "Failure Parsing %s: %s" % (xmlfile, e) )
        # Without a tree there is nothing to flatten.
        return ""

    parts = []
    for child in tree.getroot():
        if child.tag == 'LandmarkPointListXml':
            parts.append( '|' + loopLandmark(child) + '|' )
        elif child.text is not None:
            parts.append( child.text + ',' )
    return "".join( parts )

def loopLandmark( landmarks ):
    """Return the X/Y coordinates of every 'anyType' landmark as "x,y,x,y,".

    Children of landmarks whose tag is not 'anyType' are ignored, as are
    coordinate elements with empty text.  Each kept value is followed by
    a comma.
    """
    wanted = ( 'LandmarkPointX', 'LandmarkPointY' )
    pieces = []
    for landmark in landmarks:
        if landmark.tag != 'anyType': # check also xsi:type="LandmarkPointProperty"?
            continue
        for field in landmark:
            if field.text and field.tag in wanted:
                pieces.append( field.text + ',' )
    return "".join( pieces )

# Show the flattened result; the bare call discarded the returned string.
print( GetMapData( 'xml.in' ) )

撰写回答