使用beautifulsoup将多个.html转换为单个csv

"""Collect labelled fields from every ``.htm`` file in a folder into one CSV.

Each input page contributes exactly one row to ``names.csv``.
"""
import csv
import os
import re

from bs4 import BeautifulSoup

path = r'C:/Users/Mx/Testing/Infod'
# NOTE(review): str.endswith('.htm') does not match names ending in '.html';
# confirm which extension the input files really use.
ext = '.htm'

# The reference number sits in a <td> whose text matches this pattern.
# Compiled once, outside the per-file loop.
pattern = re.compile(r'GBBTI\S{9}')

# (CSV column name, <b> label text searched for in the page).
# 'Name and adress' is kept exactly as the original search string; it has to
# match the page text, so only change it if the pages spell it differently.
labelled_columns = [
    ('country', 'Issuing country'),
    ('Start date of Validity', 'Start date of validity'),
    ('End date of validity', 'End date of validity'),
    ('Nomenclature code', 'Nomenclature code'),
    ('Classification justification', 'Classification justification'),
    ('Language', 'Language'),
    ('Place of issue', 'Place of issue'),
    ('Date of issue', 'Date of issue'),
    ('Name and address', 'Name and adress'),
    ('Description', 'Description of goods'),
    ('keywords', 'National keywords'),
]
fieldnames = ['Ref'] + [column for column, _ in labelled_columns]


def _cell_text(tag):
    """Return the whitespace-normalised text of *tag*, or '' if it is None."""
    return tag.get_text(' ', strip=True) if tag is not None else ''


def _value_for(soup, label):
    """Return the text of the first <td> following the <b> holding *label*.

    Returns '' when the label is not present, so one incomplete page does not
    abort the whole run.
    """
    heading = soup.find('b', text=label)
    if heading is None:
        return ''
    return _cell_text(heading.find_next('td'))


# Open the output ONCE, before the loop: reopening it with mode 'w' for every
# input file truncates it each time and leaves only the last row.
# newline='' is what the csv module expects for files it writes to.
with open('names.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for filename in sorted(os.listdir(path)):
        if not filename.endswith(ext):
            continue

        fullpath = os.path.join(path, filename)
        with open(fullpath, encoding='utf-8') as html_file:
            soup = BeautifulSoup(html_file, 'html.parser')

        # DictWriter.writerow takes a single mapping keyed by fieldnames.
        row = {'Ref': _cell_text(soup.find('td', text=pattern))}
        for column, label in labelled_columns:
            row[column] = _value_for(soup, label)
        writer.writerow(row)

# Any advice would be greatly appreciated.

1条回答

网友

1楼 · 发布于 2024-05-12 18:30:34

首先，其中一个问题是：每处理一个输入文件（即每次循环），您都会重新打开输出文件进行写入，也就是执行：

open('names.csv', 'w')

这样每次循环都会重写整个文件（删除以前的数据并写入新数据），最终只会留下最后一行。为了防止这种情况并加快整个过程，我建议在循环之前只打开一次文件（不要忘记在最后关闭它）。

此外，由于csv是一种非常简单的格式，我不确定是否真的有必要使用某个库来处理它（但请注意：如果字段值本身包含逗号、引号或换行符，就必须正确加引号转义，这种情况下应使用 csv 模块）。下面是一个如何实现这一点的示例：

import csv
import os
import re

from bs4 import BeautifulSoup

path = r'C:/Users/Mx/Testing/Infod'

ext = '.htm'

# Open the output file once, before the loop, so earlier rows are not
# overwritten. The with-block closes it automatically, even on error.
with open('names.csv', 'w', encoding='utf-8') as out_file:
    out_file.write(",".join(['Ref', 'country', 'Start date of Validity', 'End date of validity', 'Nomenclature code', 'Classification justification', 'Language', 'Place of issue', 'Date of issue', 'Name and address', 'Description', 'keywords']) + "\n")  # Write your keys

    for filename in os.listdir(path):

        if filename.endswith(ext):

            # ...
            # get your data
            # ...

            # NOTE: a plain ",".join() does no quoting. If any value can
            # contain a comma, a double quote or a newline, write the row
            # with csv.writer instead.
            out_file.write(",".join([ """pass here your values in corrent order""" ]) + "\n")  # write your values separated by commas

相关问题更多 >

编程相关推荐

热门问题

热门文章