使用BeautifulSoup解析Python中XML的重复子节点

# Extract bibliographic fields from one patent XML document and append them
# to ./output.csv.
# NOTE(review): the original snippet was collapsed onto a single line; the
# nesting below is reconstructed from the syntax -- confirm against the
# author's original layout.
soup = BeautifulSoup(xml_string, "lxml")
pub_ref = soup.findAll("publication-reference")

# NOTE(review): 'ab+' is kept from the original. On Python 3, csv.writer
# needs a text-mode file (e.g. open(path, 'a', newline='')) -- confirm which
# interpreter this script targets before changing it.
with open('./output.csv', 'ab+') as f:
    writer = csv.writer(f, dialect='excel')
    for info in pub_ref:
        # Every lookup below searches the whole document (`soup`), not the
        # current `info` element, exactly as in the original.
        assign = soup.findAll("assignee")
        pat_cite = soup.findAll("patcit")

        # Defaults so writerow() cannot hit an unbound name when no
        # <orgname> / <name> element exists.
        org_name = ""
        name = ""

        # The last assignee that has an <orgname> wins.
        for item1 in assign:
            # Fixed: the original referenced the undefined name `item` here.
            org_tag = item1.find("orgname")
            if org_tag is not None:
                org_name = org_tag.text

        # The last citation that has a <name> wins.
        for item2 in pat_cite:
            name_tag = item2.find("name")
            if name_tag is not None:
                name = name_tag.text

        # zip() stops at the shortest result list, so the number of rows
        # written equals the count of the rarest tag.
        for (inv_name, pat_num, cpc_num, class_num, subclass_num,
             date_num, country, city, state) in zip(
                soup.findAll("invention-title"),
                soup.findAll("doc-number"),
                soup.findAll("section"),
                soup.findAll("class"),
                soup.findAll("subclass"),
                soup.findAll("date"),
                soup.findAll("country"),
                soup.findAll("city"),
                soup.findAll("state")):
            writer.writerow([
                inv_name.text, pat_num.text, org_name, cpc_num.text,
                class_num.text, subclass_num.text, date_num.text,
                country.text, city.text, state.text, name,
            ])

invention name country city .... patcit name1 patcit date1.... white space patcit name2 patcit date2.... white space patcit name2 patcit date3....

1条回答

网友

1楼 · 发布于 2024-05-13 02:20:05

试试下面的脚本。我想这就是你想要的。

from bs4 import BeautifulSoup

xml_content='''
<us-references-cited>
<us-citation>
<patcit num="00001">
<document-id>
<country>US</country>
<doc-number>1589850</doc-number>
<kind>A</kind>
<name>Haskell</name>
<date>19260600</date>
</document-id>
</patcit>
<category>cited by applicant</category>
</us-citation>
<us-citation>
<patcit num="00002">
<document-id>
<country>US</country>
<doc-number>D134414</doc-number>
<kind>S</kind>
<name>Orme, Jr.</name>
<date>19421100</date>
</document-id>
</patcit>
<category>cited by applicant</category>
</us-citation>
<us-citation>
'''
# Parse the sample and print one line per <patcit> citation:
# name, date, kind, doc-number, country.
soup = BeautifulSoup(xml_content, "lxml")

# The attribute value is quoted because an unquoted value must be a valid CSS
# identifier, and "000" starts with a digit; current BeautifulSoup (Soup Sieve)
# rejects the unquoted form as a malformed attribute selector.
for citation in soup.select('patcit[num^="000"]'):
    fields = []
    for tag_name in ("name", "date", "kind", "doc-number", "country"):
        node = citation.select_one(tag_name)
        # A citation missing a field prints an empty string instead of
        # raising IndexError.
        fields.append(node.text if node is not None else "")
    name, date, kind, doc_number, country = fields
    print(name, date, kind, doc_number, country)

结果：

Haskell 19260600 A 1589850 US
Orme, Jr. 19421100 S D134414 US

此解决方案适用于您稍后提供的链接：

import requests
from bs4 import BeautifulSoup

# Download the page at the URL below and print the text of each row of its
# second <table>, with the row's cells joined by single spaces.
res = requests.get(
    "https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/2017/",
    timeout=30,  # requests has no default timeout; avoid hanging forever
)
# Fail loudly on an HTTP error instead of parsing an error page.
res.raise_for_status()
soup = BeautifulSoup(res.text, "lxml")
# Index 1 selects the second <table> on the page, as in the original.
table = soup.select("table")[1]
for row in table.select("tr"):
    data = ' '.join(cell.text for cell in row.select("td"))
    print(data)

相关问题更多 >

编程相关推荐

热门问题

热门文章