用Python解析目录中的所有XML文件

import xml.etree.ElementTree as ET
import os

# Folder whose entries are listed and handed to ET.parse by clean_dir().
directory = "C:/Users/danie/Desktop/NLP/blogs/"


def clean_dir(directory):
    """Parse every entry of *directory* and collect its <post> texts.

    Prints the directory listing, then parses each listed name with
    ElementTree and passes the document root to doc_parser(). Any parse
    failure propagates to the caller and stops the loop.
    """
    path = os.listdir(directory)
    print(path)
    for filename in path:
        # NOTE: os.listdir() yields bare names, so this parse resolves them
        # against the current working directory rather than *directory*.
        tree = ET.parse(filename)
        root = tree.getroot()
        doc_parser(root)


# Text of every <post> element seen so far, across all parsed files.
post_list = []


def doc_parser(root):
    """Append the text of each direct <post> child of *root* to post_list."""
    for child in root.findall('post'):
        post_list.append(child.text)


clean_dir(directory)
print(post_list[0])

  File "D:\Anaconda\envs\Deep Learning New\lib\site-packages\IPython\core\interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-91-fce6b0119ea7>", line 1, in <module>
    runfile('C:/Users/danie/Desktop/NLP/blogs/Parser_Tes.py', wdir='C:/Users/danie/Desktop/NLP/blogs')
  File "D:\Anaconda\envs\Deep Learning New\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 827, in runfile
    execfile(filename, namespace)
  File "D:\Anaconda\envs\Deep Learning New\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 110, in execfile
    exec(compile(f.read(), filename, 'exec'), namespace)
  File "C:/Users/danie/Desktop/NLP/blogs/Parser_Tes.py", line 19, in <module>
    clean_dir(directory)
  File "C:/Users/danie/Desktop/NLP/blogs/Parser_Tes.py", line 9, in clean_dir
    tree = ET.parse(filename)
  File "D:\Anaconda\envs\Deep Learning New\lib\xml\etree\ElementTree.py", line 1196, in parse
    tree.parse(source, parser)
  File "D:\Anaconda\envs\Deep Learning New\lib\xml\etree\ElementTree.py", line 597, in parse
    self._root = parser._parse_whole(source)
  File "<string>", line unknown
ParseError: not well-formed (invalid token): line 103, column 225

['1000331.female.37.indUnk.Leo.xml', '1000866.female.17.Student.Libra.xml', '1004904.male.23.Arts.Capricorn.xml', '1005076.female.25.Arts.Cancer.xml', '1005545.male.25.Engineering.Sagittarius.xml', '1007188.male.48.Religion.Libra.xml', '100812.female.26.Architecture.Aries.xml', '1008329.female.16.Student.Pisces.xml', '1009572.male.25.indUnk.Cancer.xml', '1011153.female.27.Technology.Virgo.xml', '1011289.female.25.indUnk.Libra.xml', '1011311.female.17.indUnk.Scorpio.xml', '1013637.male.17.RealEstate.Virgo.xml', '1015252.female.23.indUnk.Pisces.xml', '1015556.male.34.Technology.Virgo.xml', '1016560.male.41.Publishing.Sagittarius.xml', '1016738.male.26.Publishing.Libra.xml', '1016787.female.24.Communications-Media.Leo.xml', '1019224.female.27.RealEstate.Libra.xml', '1019622.female.24.indUnk.Aquarius.xml', '1019710.male.16.Student.Pisces.xml', '1021779.female.25.indUnk.Scorpio.xml', '1022037.male.23.indUnk.Cancer.xml', '1022086.female.17.Student.Cancer.xml', '1024234.female.17.indUnk.Libra.xml', '1025783.female.17.Student.Gemini.xml', '1026164.female.23.Education.Aries.xml', '1026443.female.15.Student.Scorpio.xml', '1028027.female.16.indUnk.Libra.xml', '1028257.male.26.Education.Aries.xml', '1029959.male.17.indUnk.Aries.xml', '1031806.male.17.Technology.Sagittarius.xml', '1032153.male.27.Technology.Pisces.xml', '1032591.female.24.Banking.Aquarius.xml', '1032824.female.15.Student.Libra.xml', '1034874.female.43.Publishing.Capricorn.xml', '1039136.male.24.Student.Capricorn.xml', '1039908.female.16.indUnk.Gemini.xml', '1040084.male.17.indUnk.Taurus.xml', '1042993.male.15.Student.Sagittarius.xml', '1043329.male.23.Government.Pisces.xml', '1043569.male.26.indUnk.Virgo.xml', '1043785.female.26.Biotech.Leo.xml', '1044338.female.23.Student.Leo.xml', '1045289.female.25.Arts.Aquarius.xml', '1045316.male.27.Non-Profit.Capricorn.xml', '1045831.male.23.Student.Libra.xml', '1046946.female.25.Arts.Virgo.xml', '1047241.male.16.indUnk.Aries.xml', '1050060.female.24.Student.Pisces.xml', 
'1051122.female.17.Student.Libra.xml', '1052611.male.23.Student.Aries.xml', '1054833.female.24.indUnk.Scorpio.xml', '1055228.female.16.Student.Cancer.xml', '1056232.female.17.indUnk.Aquarius.xml', '1056581.female.26.indUnk.Leo.xml', ....]

ERROR ON FILE: 669116.female.26.indUnk.Gemini.xml
ERROR ON FILE: 669514.female.27.indUnk.Sagittarius.xml
ERROR ON FILE: 669656.female.23.Advertising.Aries.xml
ERROR ON FILE: 669719.male.26.Science.Taurus.xml
ERROR ON FILE: 669764.female.17.indUnk.Sagittarius.xml
ERROR ON FILE: 670277.female.27.Education.Sagittarius.xml
ERROR ON FILE: 670314.male.24.indUnk.Leo.xml
ERROR ON FILE: 670684.male.24.Student.Libra.xml
ERROR ON FILE: 671748.male.27.Communications-Media.Aries.xml
ERROR ON FILE: 673093.male.27.Construction.Scorpio.xml
ERROR ON FILE: 673235.male.37.Internet.Capricorn.xml
ERROR ON FILE: 67459.male.34.Arts.Capricorn.xml
ERROR ON FILE: 674684.female.23.Religion.Libra.xml

1条回答

网友

1楼 · 发布于 2024-06-17 11:44:03

@Kevin的评论是正确的，这个错误与`ET`（ElementTree）对象无法正确解析文档有关。某些文件并不是“真正的XML”，原因可能很简单，比如文件中包含一个奇怪的非Unicode字符。

您可以尝试执行以下操作来帮助调试：

import xml.etree.ElementTree as ET
import os
# Folder that clean_dir() lists and scans; passed to it at the bottom of the script.
directory = "C:/Users/danie/Desktop/NLP/blogs/"

def clean_dir(directory):
    """Parse every XML file in *directory* and feed its root to doc_parser().

    The directory listing is printed first. Each entry whose name ends in
    ``.xml`` (case-insensitive) is then parsed; other entries are skipped
    without being reported. A file that cannot be read, or that is not
    well-formed XML, is reported as ``ERROR ON FILE: <name>`` and skipped so
    the remaining files are still processed.

    Args:
        directory: Path of the folder to scan (not recursive).

    Raises:
        OSError: If *directory* itself cannot be listed.
    """
    path = os.listdir(directory)
    print(path)
    for filename in path:
        # Skip non-XML entries (e.g. a script stored in the same folder).
        if not filename.lower().endswith(".xml"):
            continue
        # os.listdir() returns bare names; join them with the folder so
        # parsing does not depend on the current working directory.
        full_path = os.path.join(directory, filename)
        try:
            tree = ET.parse(full_path)
        except (ET.ParseError, OSError):
            # Only the expected per-file failures are swallowed; anything
            # else (including KeyboardInterrupt) still propagates.
            print("ERROR ON FILE: {}".format(filename))
            continue
        doc_parser(tree.getroot())


# Accumulates the text of every <post> element handed to doc_parser().
post_list = []


def doc_parser(root):
    """Collect the text of each direct ``post`` child of *root* into post_list."""
    post_list.extend(entry.text for entry in root.findall('post'))

clean_dir(directory)
# post_list stays empty when no file yielded a <post> (for example when
# every file failed to parse), so guard the lookup instead of letting it
# fail with IndexError.
if post_list:
    print(post_list[0])
else:
    print("No posts were collected from {}".format(directory))

添加`try/except`语句后，程序会依次尝试解析每个文件；如果出现错误，就打印出导致错误的文件名。

我手头没有可用于测试的数据，但这应该可以修复错误。

相关问题更多 >

编程相关推荐

热门问题

热门文章