Python解析XML文件并将每个节点另存为文件

2024-05-26 11:55:29 发布

您现在位置:Python中文网/ 问答频道 /正文

我想解析一个包含多个节点(以及多个命名空间)的XML文件。目标是提取每个名为CodeList的节点,并将每个节点分别保存为一个新的xml文件,文件名取自该节点的OID属性。

下面是对原始XML的提取:

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="../xsl/controlledterminology1-0-0.xsl"?>
<ODM xmlns="http://www.cdisc.org/ns/odm/v1.3"
  xmlns:xs="http://www.w3.org/2001/XMLSchema-instance"
  xmlns:nciodm="http://ncicb.nci.nih.gov/xml/odm/EVS/CDISC"
  xs:schemaLocation="http://www.nci.nih.gov/EVS/CDISC ../schema/controlledterminology1-0-0.xsd"
  FileType="Snapshot"
  FileOID="CDISC_CT.SDTM.2011-06-10"
  Granularity="Metadata"
  CreationDateTime="2011-06-07T07:35:51"
  AsOfDateTime="2011-06-10T00:00:00"
  ODMVersion="1.3.1"
  Originator="CDISC XML Technologies Team (SAS 9.02.02M3P04132010)"
  SourceSystem="NCI Thesaurus"
  SourceSystemVersion="2011-06-10">
  <Study OID="CDISC_CT.SDTM.2011-06-10">
    <GlobalVariables>
      <StudyName>CDISC SDTM ControlledTerminology</StudyName>
      <StudyDescription>CDISC SDTM Controlled Terminology, 2011-06-10</StudyDescription>
      <ProtocolName>CDISC SDTM Controlled Terminology</ProtocolName>
    </GlobalVariables>
    <MetaDataVersion OID="CDISC_CT_MetaDataVersion.SDTM.2011-06-10"
      Name="CDISC SDTM Controlled Terminology"
      Description="CDISC SDTM Controlled Terminology, 2011-06-10">
      <CodeList OID="CL.C66767.ACN" Name="Action Taken with Study Treatment" DataType="text" nciodm:ExtCodeID="C66767" nciodm:CodeListExtensible="No">
        <Description>
          <TranslatedText xml:lang="en">Action Taken with Study Treatment</TranslatedText>
        </Description>
        <EnumeratedItem CodedValue="DOSE INCREASED" nciodm:ExtCodeID="C49503">
          <nciodm:CDISCDefinition>An indication that a medication schedule was modified by addition; either by changing the frequency, strength or amount. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Dose Increased</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="DOSE NOT CHANGED" nciodm:ExtCodeID="C49504">
          <nciodm:CDISCDefinition>An indication that a medication schedule was maintained. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Dose Not Changed</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="DOSE REDUCED" nciodm:ExtCodeID="C49505">
          <nciodm:CDISCDefinition>An indication that a medication schedule was modified by subtraction, either by changing the frequency, strength or amount. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Dose Reduced</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="DRUG INTERRUPTED" nciodm:ExtCodeID="C49501">
          <nciodm:CDISCDefinition>An indication that a medication schedule was modified by temporarily terminating a prescribed regimen of medication. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Drug Interrupted</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="DRUG WITHDRAWN" nciodm:ExtCodeID="C49502">
          <nciodm:CDISCDefinition>An indication that a medication schedule was modified through termination of a prescribed regimen of medication. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Drug Withdrawn</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="NOT APPLICABLE" nciodm:ExtCodeID="C48660">
          <nciodm:CDISCSynonym>NA</nciodm:CDISCSynonym>
          <nciodm:CDISCDefinition>Determination of a value is not relevant in the current context. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Not Applicable</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="UNKNOWN" nciodm:ExtCodeID="C17998">
          <nciodm:CDISCSynonym>U</nciodm:CDISCSynonym>
          <nciodm:CDISCSynonym>Unknown</nciodm:CDISCSynonym>
          <nciodm:CDISCDefinition>Not known, not observed, not recorded, or refused. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Unknown</nciodm:PreferredTerm>
        </EnumeratedItem>
        <nciodm:CDISCSubmissionValue>ACN</nciodm:CDISCSubmissionValue>
        <nciodm:CDISCSynonym>Action Taken with Study Treatment</nciodm:CDISCSynonym>
        <nciodm:PreferredTerm>CDISC SDTM Action Taken with Study Treatment Terminology</nciodm:PreferredTerm>
      </CodeList>
      <CodeList OID="CL.C66768.OUT" Name="Outcome of Event" DataType="text" nciodm:ExtCodeID="C66768" nciodm:CodeListExtensible="No">
        <Description>
          <TranslatedText xml:lang="en">A condition or event that is attributed to the adverse event and is the result or conclusion of the adverse event. (NCI)</TranslatedText>
        </Description>
        <EnumeratedItem CodedValue="FATAL" nciodm:ExtCodeID="C48275">
          <nciodm:CDISCSynonym>Grade 5</nciodm:CDISCSynonym>
          <nciodm:CDISCSynonym>5</nciodm:CDISCSynonym>
          <nciodm:CDISCDefinition>The termination of life as a result of an adverse event. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Death Related to Adverse Event</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="NOT RECOVERED/NOT RESOLVED" nciodm:ExtCodeID="C49494">
          <nciodm:CDISCDefinition>One of the possible results of an adverse event outcome that indicates that the event has not improved or recuperated. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Not Recovered or Not Resolved</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="RECOVERED/RESOLVED" nciodm:ExtCodeID="C49498">
          <nciodm:CDISCDefinition>One of the possible results of an adverse event outcome that indicates that the event has improved or recuperated. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Recovered or Resolved</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="RECOVERED/RESOLVED WITH SEQUELAE" nciodm:ExtCodeID="C49495">
          <nciodm:CDISCDefinition>One of the possible results of an adverse event outcome where the subject recuperated but retained pathological conditions resulting from the prior disease or injury. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Recovered or Resolved with Sequelae</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="RECOVERING/RESOLVING" nciodm:ExtCodeID="C49496">
          <nciodm:CDISCDefinition>One of the possible results of an adverse event outcome that indicates that the event is improving. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Recovering or Resolving</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="UNKNOWN" nciodm:ExtCodeID="C17998">
          <nciodm:CDISCSynonym>U</nciodm:CDISCSynonym>
          <nciodm:CDISCSynonym>Unknown</nciodm:CDISCSynonym>
          <nciodm:CDISCDefinition>Not known, not observed, not recorded, or refused. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Unknown</nciodm:PreferredTerm>
        </EnumeratedItem>
        <nciodm:CDISCSubmissionValue>OUT</nciodm:CDISCSubmissionValue>
        <nciodm:CDISCSynonym>Outcome of Event</nciodm:CDISCSynonym>
        <nciodm:PreferredTerm>CDISC SDTM Adverse Event Outcome Terminology</nciodm:PreferredTerm>
      </CodeList>
      <CodeList OID="CL.C66780.AGESPAN" Name="Age Span" DataType="text" nciodm:ExtCodeID="C66780" nciodm:CodeListExtensible="Yes">
        <Description>
          <TranslatedText xml:lang="en">Subgroups of populations based on age. (NCI)</TranslatedText>
        </Description>
        <EnumeratedItem CodedValue="ADOLESCENT (12-17 YEARS)" nciodm:ExtCodeID="C27954">
          <nciodm:CDISCDefinition>A juvenile between the onset of puberty and maturity; in the state of development between puberty and maturity. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Adolescent</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="ADULT (18-65)" nciodm:ExtCodeID="C49685">
          <nciodm:CDISCDefinition>A person from 18 years to 65 years of age. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Adult 18-65 Years Old</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="CHILDREN (2-11 YEARS)" nciodm:ExtCodeID="C49683">
          <nciodm:CDISCDefinition>A person from 2 years to 11 years of age. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Children 2-11 Years Old</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="ELDERLY (&gt; 65)" nciodm:ExtCodeID="C16268">
          <nciodm:CDISCDefinition>An age group comprised by people 65 years of age and older. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Elderly</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="IN UTERO" nciodm:ExtCodeID="C49641">
          <nciodm:CDISCDefinition>The period of time during which the embryo or fetus is present in the uterus of the female. Also describes the location of the embryo or fetus as being in the uterus in contrast to outside the uterus (ex utero). (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>In Utero</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="INFANT AND TODDLER (28 DAYS - 23 MONTHS)" nciodm:ExtCodeID="C49643">
          <nciodm:CDISCDefinition>A person from 28 days to 23 months of age. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Infant And Toddler</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="NEWBORN (0-27 DAYS)" nciodm:ExtCodeID="C16731">
          <nciodm:CDISCDefinition>An infant during the first month after birth. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Newborn</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="PRETERM NEWBORN INFANTS" nciodm:ExtCodeID="C49642">
          <nciodm:CDISCDefinition>An infant born prior to completion of the normal gestation period. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Preterm Newborn Infant</nciodm:PreferredTerm>
        </EnumeratedItem>
        <nciodm:CDISCSubmissionValue>AGESPAN</nciodm:CDISCSubmissionValue>
        <nciodm:CDISCSynonym>Age Span</nciodm:CDISCSynonym>
        <nciodm:PreferredTerm>CDISC SDTM Age Group Terminology</nciodm:PreferredTerm>
      </CodeList>
      <CodeList OID="CL.C66781.AGEU" Name="Age Unit" DataType="text" nciodm:ExtCodeID="C66781" nciodm:CodeListExtensible="No">
        <Description>
          <TranslatedText xml:lang="en">Those units of time that are routinely used to express the age of a person. (NCI)</TranslatedText>
        </Description>
        <EnumeratedItem CodedValue="DAYS" nciodm:ExtCodeID="C25301">
          <nciodm:CDISCDefinition>The time for Earth to make a complete rotation on its axis; ordinarily divided into twenty-four hours. This also refers to a specific day. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Day</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="HOURS" nciodm:ExtCodeID="C25529">
          <nciodm:CDISCDefinition>A unit measure of time equal to 3,600 seconds or 60 minutes. It is approximately 1/24 of a median day. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Hour</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="MONTHS" nciodm:ExtCodeID="C29846">
          <nciodm:CDISCDefinition>One of the 12 divisions of a year as determined by a calendar. It corresponds to the unit of time of approximately to one cycle of the moon's phases, about 30 days or 4 weeks. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Month</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="WEEKS" nciodm:ExtCodeID="C29844">
          <nciodm:CDISCDefinition>Any period of seven consecutive days. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Week</nciodm:PreferredTerm>
        </EnumeratedItem>
        <EnumeratedItem CodedValue="YEARS" nciodm:ExtCodeID="C29848">
          <nciodm:CDISCDefinition>The period of time that it takes for Earth to make a complete revolution around the sun, approximately 365 days; a specific one year period. (NCI)</nciodm:CDISCDefinition>
          <nciodm:PreferredTerm>Year</nciodm:PreferredTerm>
        </EnumeratedItem>
        <nciodm:CDISCSubmissionValue>AGEU</nciodm:CDISCSubmissionValue>
        <nciodm:CDISCSynonym>Age Unit</nciodm:CDISCSynonym>
        <nciodm:PreferredTerm>CDISC SDTM Age Unit Terminology</nciodm:PreferredTerm>
      </CodeList>
    </MetaDataVersion>
  </Study>
</ODM>

在这里,我希望为每个CodeList得到一个如下所示的文件,例如文件名为CL.C66767.ACN.xml:

^{pr2}$

我尝试用ElementTree来实现,但没有得到任何结果……我的解决思路对吗?

以下是我使用的代码:

import xml.etree.ElementTree as ET
from lxml import etree
from xml.etree.ElementTree import Element, SubElement
from xml.dom import minidom
import sys

# Parse the ODM terminology export (path is relative to the current working
# directory) and keep a handle on the document's root element.
tree = ET.parse('SDTM Terminology 2011-06-10.odm.xml')  
root = tree.getroot()

def prettify(elem):
    """Return a pretty-printed XML string for the given Element.

    The element is serialized with ElementTree, re-parsed with minidom and
    rendered again with a one-space indent per nesting level.
    """
    serialized = ET.tostring(elem, 'utf-8')
    document = minidom.parseString(serialized)
    pretty_xml = document.toprettyxml(indent=" ")
    return pretty_xml

def copy_tree(tree_root):
    """Wrap *tree_root* in a new ElementTree and return that tree.

    Despite the name, the element itself is not duplicated: it is passed
    straight to the ElementTree constructor as the tree's root.
    """
    wrapper = ET.ElementTree(tree_root)
    return wrapper

# Attempt to export one CodeList into its own file. The code is kept exactly
# as posted; the review notes below point out why it does not produce the
# expected output.

# Placeholder root element; only referenced by the commented-out code further down.
top = Element('ODM')

# A brand-new, empty <CodeList> element (no namespace, no children) and a
# tree wrapping it. This tree is what gets written out below.
cdl_root = ET.Element('CodeList')
new_tree = ET.ElementTree(cdl_root)

# Find the CodeList with one specific OID, using the fully-qualified ODM tag name.
for node in root.findall(".//{http://www.cdisc.org/ns/odm/v1.3}CodeList[@OID='CL.C66768.OUT']"):
    # NOTE(review): node.attrib is the element's whole attribute dictionary,
    # so this stores a dict as the OID value instead of the OID string;
    # node.get('OID') is presumably what was intended.
    cdl_root.set('OID', node.attrib)
    #print(node.attrib)

#for node in root.findall(".//{http://www.cdisc.org/ns/odm/v1.3}CodeList"):
    #print (copy_tree(node))
#    new_tree = node 
    # NOTE(review): new_tree still wraps the empty cdl_root, so none of the
    # matched node's children are written. The file object returned by open()
    # is never closed, and the output name has no .xml extension.
    new_tree.write(open('CL.C66768.OUT','w'))

#   print(node.tag)
#    codelistoid = node.tag
#    for type in node.getchildren():
#        print ("TAG: ")
#        print(type.tag)
#        #print ("ATTRIBUTES: ")
#        #print(type.attrib)
#        print ("TEXT: ")
#        print(type.text)
#    print("-------------------------------")
#scr_tag = tree.find('.//CodeList[OID="CL.C66768.OUT"]')
#root.append(scr_tag)

#child = SubElement(top, codelistoid)
#child.text = 'This is a CodeList'

#print(prettify(top))

Tags: or, of, the, to, that, nci, cdisc, codelist
1条回答
网友
1楼 · 发布于 2024-05-26 11:55:29

下面这个小脚本可以解决问题:

from lxml import etree

# Parse the source document. etree.parse() returns an ElementTree object;
# XPath queries can be run on it directly.
root = etree.parse("/home/me/tmp/test.xml")

# Important: XPath has no concept of a default namespace, so the document's
# default (prefix-less) namespace must be bound to an explicit prefix ("h").
ns = {"h": "http://www.cdisc.org/ns/odm/v1.3"}

# To fetch a single element for one particular OID instead:
# cl = root.xpath("//h:CodeList[@OID='CL.C66767.ACN']", namespaces=ns)

# Every CodeList element in the document, at any depth.
cl = root.xpath("//h:CodeList", namespaces=ns)

for t in cl:
    # The OID attribute doubles as the output file name.
    oid = t.get("OID")
    if oid is None:
        # Without an OID there is no file name to use; skip the node rather
        # than crash the whole export.
        continue
    print(oid)
    # Wrap the element in its own tree so it is serialized as a standalone
    # document (lxml emits the namespace declarations the subtree needs).
    et = etree.ElementTree(t)
    # Write one file per CodeList, named <OID>.xml as the question asks.
    et.write(oid + ".xml", pretty_print=True)

一般来说,lxml 提供了比 ElementTree 更强的功能,没有必要把两者混在一起使用。

lxml.etree offers a lot more functionality, such as XPath, XSLT, Relax NG, and XML Schema support, which ElementTree does not offer.

相关问题 更多 >