Python: comparing two JSON files and replacing similar strings


I have to create a script in Python that allows me to replace strings in a JSON file. The file contains patent information, for example:

{
  "US-8163793-B2": {
    "publication_date": "20120424",
    "priority_date": "20090420",
    "family_id": "42261969",
    "country_code": "US",
    "ipc_code": "C07D417/14",
    "cpc_code": "C07D471/04",
    "assignee_name": "Hoffman-La Roche Inc.",
    "title": "Proline derivatives",
    "abstract": "The invention relates to a compound of formula (I) wherein A, R1-R6 are as defined in the description and in the claims. The compound of formula (I) can be used as a medicament."
  }
}

However, there are roughly 15,000 entries. To normalize this document before computing word embeddings, I used a piece of software that wraps the terms it finds in tags. The output looks like this:

^{pr2}$

This output is also a JSON file and can be used as a dictionary.

What I need is that whenever one of these terms, e.g. "Roche", appears in the patent file, it is replaced by its corresponding "hitID".

I'm very new to Python, so any help or reading recommendations would be greatly appreciated.

Thank you!

EDIT

This is what we have so far:

    # NOTE: excerpt from a larger function; it assumes `import json` and `import re`
    # at module level, plus the helper dicts/functions referenced below
    # (mesh_tree_nr_to_id_dict, mesh_id_to_name_dict, name_to_mesh_id_dict,
    # mesh_id_to_tree_nr_dict, entity_type_encoder, normalize_abstract,
    # abstract_to_words) and an initial `termite_dict_list = list()`.
    with open(file, "rb") as datafile:
        json_data = json.loads(datafile.read().decode("utf-8"))

        for paper in json_data:

            termite_dict = dict()
            termite_dict_all_per_pmid = list()
            pmid = int(paper["docID"])
            abstract = paper["abstract"]

            gene_list = list()
            indication_mesh_list = list()
            drug_list = list()
            mirna_list = list()
            company_list = list()
            bioproc_list = list()
            protype_list = list()

            if "termiteTags" in paper:
                for termite_tag in paper["termiteTags"]:
                    type_entry = termite_tag["entityType"]

                    termite_dict = dict()
                    name = termite_tag["name"]
                    exact_tag_locations = termite_tag["exact_string"].split(",")
                    relevant_tag_locations = list()
                    words_to_replace = list()

                    # process and store termite annotations
                    if type_entry == "GENE":
                        gene_list.append({"Gene": termite_tag["hitID"]})
                    elif type_entry == "INDICATION":
                        info = termite_tag["entityMeta"]
                        if "mesh_tree" in info:
                            for e in list(filter(None, termite_tag["entityMeta"]["mesh_tree"].split(";"))):
                                try:
                                    mesh_id = mesh_tree_nr_to_id_dict[e]
                                    mesh_name = mesh_id_to_name_dict[mesh_id]
                                    indication_mesh_list.append({"name": mesh_name, "id": mesh_id, "key": e})
                                except KeyError:
                                    continue
                        elif "_ext_uri" in info:
                            url = termite_tag["entityMeta"]["_ext_uri"]
                            try:
                                mesh_id = url.split("term=")[1]
                                mesh_name = mesh_id_to_name_dict[mesh_id]
                                mesh_tree_nr = name_to_mesh_id_dict[mesh_name]
                                indication_mesh_list.append({"name": mesh_name, "id": mesh_id, "key": mesh_tree_nr})
                            except KeyError:
                                print("Issue with Mesh key indication")
                    elif type_entry == "DRUG":
                        drug_list.append(termite_tag["name"])
                    elif type_entry == "MIRNA":
                        mirna_list.append(termite_tag["hitID"])
                    elif type_entry == "COMPANY":
                        company_list.append(termite_tag["name"])
                    elif type_entry == "BIOPROC":
                        bioproc_list.append(termite_tag["name"])
                    elif type_entry == "PROTYP":
                        protype_list.append(termite_tag["name"])

                    # store info for positions with words to normalize in abstract text
                    for hit_number, hit in enumerate(termite_tag["frag_vector_array"]):
                        hit = hit.replace("\n", " ")

                        # tags in the fragment look like {!word!} or {*word*}
                        match = re.match(r"^.*{!(.*)!}.*$", hit)
                        if match is None:
                            match = re.match(r"^.*{\*(.*)\*\}.*$", hit)
                        if match is None:
                            # neither tag pattern matched: skip this hit instead of
                            # reusing match_word from a previous iteration
                            print(hit)
                            continue
                        match_word = match.group(1)
                        if match_word.lower() != name.lower():
                            exact_locus = exact_tag_locations[hit_number]
                            if not exact_locus.startswith("-"):
                                # sentence 0 is paper title
                                if not exact_locus.startswith("0"):
                                    relevant_tag_locations.append(exact_tag_locations[hit_number])
                                    words_to_replace.append(match_word)
                                    termite_dict["norm"] = name
                                    termite_dict["replace"] = match_word
                                    fr, t = exact_locus.split("#")[1].split("-")
                                    termite_dict["from"] = int(fr)
                                    termite_dict["to"] = int(t)
                                    termite_dict["len"] = int(t) - int(fr)
                                    termite_dict["entityCode"] = entity_type_encoder[termite_tag["entityType"]]
                                    termite_dict_all_per_pmid.append(termite_dict)
                                    termite_dict = dict()

            # abstract normalization and bag of words calculations
            if len(termite_dict_all_per_pmid) > 0:
                sorted_termite_dict_all_per_pmid = sorted(termite_dict_all_per_pmid,
                                                          key=lambda k: (k['from'], -k["len"], k["entityCode"]))
                normalized_abstract = normalize_abstract(sorted_termite_dict_all_per_pmid, abstract)
                termite_dict["Norm_Abstract"] = normalized_abstract
                cleaned_abstract_text = abstract_to_words(normalized_abstract)
                termite_dict["bag_of_words"] = list(set(cleaned_abstract_text))

            termite_dict["docID"] = pmid

            if "keywords" in paper:
                keywords = [w.strip() for w in paper["keywords"].split(";")]
                mesh_list = list()

                for word in keywords:
                    if len(word.split(" ")) == 1 and len(word) > 0 and word[0].islower():
                        word = word.title()
                    if word in name_to_mesh_id_dict:
                        mesh_id = name_to_mesh_id_dict[word]
                        try:
                            mesh_list.append([word, mesh_id, mesh_id_to_tree_nr_dict[mesh_id]])
                        except KeyError:
                            mesh_list.append([word, mesh_id, ""])
                termite_dict["MeshHeadings"] = mesh_list

            if len(gene_list) > 0:
                termite_dict["Genes"] = gene_list
            if len(indication_mesh_list) > 0:
                termite_dict["Indications"] = indication_mesh_list
            if len(drug_list) > 0:
                termite_dict["Drug"] = drug_list
            if len(mirna_list) > 0:
                termite_dict["MIRNA"] = mirna_list
            if len(company_list) > 0:
                termite_dict["Company"] = company_list
            if len(bioproc_list) > 0:
                termite_dict["Bioproc"] = bioproc_list
            if len(protype_list) > 0:
                termite_dict["Protyp"] = protype_list

            # add meta list to be able to query for gene and indication co-occurrence
            meta_list = list()
            if "Indications" in termite_dict:
                meta_list.extend([indi["key"] for indi in termite_dict["Indications"]])
            if "Genes" in termite_dict:
                meta_list.extend([gene["Gene"] for gene in termite_dict["Genes"]])
            if len(meta_list) > 0:
                termite_dict["all_genes_indications"] = meta_list

            termite_dict_list.append(termite_dict)
    return termite_dict_list
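
The snippet relies on a normalize_abstract helper that is not shown here. Below is a minimal sketch of what such positional replacement could look like; the function name is made up, and it assumes the "from"/"to" values are 0-based character offsets into the abstract with an exclusive end, which is my reading of the code above rather than something stated in the question. Applying the replacements from the end of the string backwards keeps the earlier offsets valid.

def normalize_abstract_sketch(tag_dicts, abstract):
    # replace right-to-left so earlier character offsets are not shifted
    for tag in sorted(tag_dicts, key=lambda d: d["from"], reverse=True):
        abstract = abstract[:tag["from"]] + tag["norm"] + abstract[tag["to"]:]
    return abstract

# toy example (values invented for illustration)
abstract = "Study sponsored by Hoffman-La Roche Inc. in 2009."
tags = [{"from": 19, "to": 40, "norm": "COMP642", "replace": "Hoffman-La Roche Inc."}]
print(normalize_abstract_sketch(tags, abstract))
# -> Study sponsored by COMP642 in 2009.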

1 Answer

If I'm following what you're after, I think you want to replace the "assignee_name" in your patent data with the corresponding "hitID" from your company data, based on the company data's "name" being contained in the patent data.

Two loops will do it (though I'm sure there's a more elegant way). Of course, if you need something more sophisticated to decide whether a "name" from the company data really matches an "assignee_name" in the patent data, you can add some regex or similar to this approach (a word-boundary variant is sketched after the output below), but this should point you in the right direction.

import json

patents = json.loads("""{
        "US-8163793-B2": {
            "publication_date": "20120424",
            "assignee_name": "Hoffman-La Roche Inc."
        },
        "US-1234567-A1": {
            "publication_date": "20010101",
            "assignee_name": "ABC Inc."
        }
    }""")

companies = json.loads("""{
        "Row_1": {
            "COMPANY": [
                {
                    "hitID": "COMP642",
                    "name": "Roche"
                }
            ]
        },
        "Row_2": {
            "COMPANY": [
                {
                    "hitID": "COMP123",
                    "name": "ABC"
                }
            ]
        }
    }""")

# loop through companies data
for company in companies.values():
    company_id = company['COMPANY'][0]['hitID']
    company_name = company['COMPANY'][0]['name']

    # update patents where company "name" included in "assignee_name"
    for patent in patents.values():
        if company_name in patent['assignee_name']:
            patent['assignee_name'] = company_id

print(patents)

# OUTPUT (use json.dump to write to file if needed)
#
# {
#     'US-8163793-B2': {'publication_date': '20120424', 'assignee_name': 'COMP642'},
#     'US-1234567-A1': {'publication_date': '20010101', 'assignee_name': 'COMP123'}
# }
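
The answer mentions that regex can be added if plain substring matching is not strict enough (for example, "ABC" above would also match an assignee called "ABCDEF Inc."). Below is a minimal sketch of that idea, reusing the patents and companies dicts from the snippet above; the word-boundary check and the output filename are my own choices, not part of the original answer.

import json
import re

for company in companies.values():
    company_id = company['COMPANY'][0]['hitID']
    company_name = company['COMPANY'][0]['name']
    # only match the company name as a whole word inside the assignee name
    pattern = re.compile(r'\b' + re.escape(company_name) + r'\b', re.IGNORECASE)

    for patent in patents.values():
        if pattern.search(patent['assignee_name']):
            patent['assignee_name'] = company_id

# write the updated patent data back out (filename is just an example)
with open('patents_normalized.json', 'w', encoding='utf-8') as out:
    json.dump(patents, out, indent=4)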
