将XML数据转换为数据帧(DataFrame)

2024-06-09 15:19:33 发布

您现在位置:Python中文网/ 问答频道 /正文

我想从XML中提取数据并将其转换为数据帧

首先,我尝试用 xml.etree.ElementTree 导入 XML,并用一段代码打印出其中的一些列,但没有成功

所以现在我已经设法用这段代码提取了一些数据


import xml.etree.cElementTree as et
import pandas as pd
def getvalueofnode(node):
    """Return the text of ``node``, or ``None`` when ``node`` itself is ``None``."""
    if node is None:
        return None
    return node.text
def main():
    """Parse ``RTS_XTV100006361.xml`` and print its entries as a DataFrame.

    One row is built per direct child of the document root.  ``start``,
    ``stop``, ``channel`` and ``type`` are read from the child's attributes;
    ``title`` and ``category`` are the text of its ``title`` and ``category``
    sub-elements, or ``None`` when the lookup finds nothing.

    NOTE: ``find('title')`` and ``find('category')`` use un-namespaced tag
    names.  For a document that declares a default namespace these lookups
    match nothing, so both columns come back as ``None``.
    """
    parsed_xml = et.parse("RTS_XTV100006361.xml")
    dfcols = ['start', 'stop', 'channel', 'type', 'title', 'category']

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    # Keying each value by column name also keeps title/category from
    # being stored under each other's column.
    rows = []
    for node in parsed_xml.getroot():
        rows.append({
            'start': node.attrib.get('start'),
            'stop': node.attrib.get('stop'),
            'channel': node.attrib.get('channel'),
            'type': node.attrib.get('type'),
            'title': getvalueofnode(node.find('title')),
            'category': getvalueofnode(node.find('category')),
        })

    # Passing columns= keeps the header even when the root has no children.
    df_xml = pd.DataFrame(rows, columns=dfcols)
    print(df_xml)

但标题和类别仍然为空

以下是XML的示例:

    <?xml version="1.0" encoding="utf-8"?>
<tv xmlns="http://www.xmltv.org/xmltv" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  <programme ID="58135" start="20200528000000 +0200" stop="20200528013000 +0200" channel="XTV100006361" recordable="Y" npvrenable="Y" cpvrenable="N" type="program" deleted="false">
    <blackoutInfo>
      <isBlackout>1</isBlackout>
      <groupIDs>2</groupIDs>
      <streams>00000001</streams>
    </blackoutInfo>
    <title lang="sr"><![CDATA[Yoga retreat]]></title>
    <category lang="sr"><![CDATA[Entertainment]]></category>
    <icon src="default_playbill.png" ptype="3" />
    <episode-num system="xmltv_ns">0.0.0</episode-num>
  </programme>
  <programme ID="58136" start="20200528013000 +0200" stop="20200528030000 +0200" channel="XTV100006361" recordable="Y" npvrenable="Y" cpvrenable="N" type="program" deleted="false">
    <blackoutInfo>
      <isBlackout>1</isBlackout>
      <groupIDs>2</groupIDs>
      <streams>00000001</streams>
    </blackoutInfo>
    <title lang="sr"><![CDATA[Doctor Who]]></title>
    <category lang="sr"><![CDATA[Entertainment]]></category>
    <icon src="default_playbill.png" ptype="3" />
    <episode-num system="xmltv_ns">0.0.0</episode-num>
  </programme>
</tv>

我想要的理想输出是我得到以下列:

channel, start, stop, title, category

desired output


Tags: node df get title type channel xml start
2条回答

XML文档使用默认名称空间(http://www.xmltv.org/xmltv)。

一种方法是在 findall() 和 find() 中使用通配符 {*} 来匹配任意名称空间。这需要 Python 3.8 或更高版本。

import xml.etree.ElementTree as et
import pandas as pd

def getvalueofnode(node):
    """Return ``node.text``; a missing node (``None``) yields ``None``."""
    return None if node is None else node.text

def main():
    """Print the programmes of ``RTS_XTV100006361.xml`` as a DataFrame.

    Every ``programme`` child of the root, in any namespace, becomes one
    row.  ``start``, ``stop``, ``channel`` and ``type`` are read from the
    element's attributes; ``title`` and ``category`` are the text of the
    matching child elements, or ``None`` when the child is absent.
    """
    parsed_xml = et.parse("RTS_XTV100006361.xml")
    dfcols = ['start', 'stop', 'channel', 'type', 'title', 'category']

    # Build all rows first and create the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the frame on every iteration.
    # "{*}" is ElementTree's any-namespace wildcard (Python 3.8+).
    rows = [
        {
            'start': programme.get('start'),
            'stop': programme.get('stop'),
            'channel': programme.get('channel'),
            'type': programme.get('type'),
            'title': getvalueofnode(programme.find('{*}title')),
            'category': getvalueofnode(programme.find('{*}category')),
        }
        for programme in parsed_xml.findall("{*}programme")
    ]

    # Passing columns= keeps the header even when no programme matched.
    df_xml = pd.DataFrame(rows, columns=dfcols)

    # NOTE(review): presumably set so the wide frame is not wrapped when
    # printed -- confirm against the pandas display.width documentation.
    pd.options.display.width = 0
    print(df_xml)

# Run the conversion; main() prints the resulting DataFrame.
main()

结果:

                  start                  stop       channel     type         title       category
0  20200528000000 +0200  20200528013000 +0200  XTV100006361  program  Yoga retreat  Entertainment
1  20200528013000 +0200  20200528030000 +0200  XTV100006361  program    Doctor Who  Entertainment

正如 @mzjn 所指出的,XML 有一个默认名称空间;查找元素时必须带上它

import xml.etree.ElementTree as ET

def main():
    """Load ``RTS_XTV100006361.xml`` and return its programmes as a DataFrame.

    Each ``programme`` element in the ``http://www.xmltv.org/xmltv``
    namespace becomes one row holding its ``channel``, ``start``, ``stop``
    and ``type`` attributes plus the text of its first ``title`` and
    ``category`` descendants.

    Returns:
        pandas.DataFrame: one row per programme.  ``title`` / ``category``
        are ``None`` for a programme that has no such element.
    """
    tree = ET.parse("RTS_XTV100006361.xml")
    # The document's default namespace, in ElementTree's "{uri}tag" form.
    ns = '{http://www.xmltv.org/xmltv}'

    # Attributes copied into each row; all other attributes are dropped.
    keys = {'channel', 'start', 'stop', 'type'}
    box = []
    for ent in tree.findall(f".//{ns}programme"):
        d = {key: value for key, value in ent.attrib.items() if key in keys}
        for tag in ('title', 'category'):
            node = ent.find(f".//{ns}{tag}")
            # find() returns None for a missing element; store None instead
            # of failing on ``None.text``.
            d[tag] = node.text if node is not None else None
        box.append(d)

    return pd.DataFrame(box)

# Run the conversion; the returned DataFrame is not assigned to anything here.
main()

           start                  stop               channel         type   title            category
0   20200528000000 +0200    20200528013000 +0200    XTV100006361    program Yoga retreat    Entertainment
1   20200528013000 +0200    20200528030000 +0200    XTV100006361    program Doctor Who      Entertainment

相关问题 更多 >