我必须用Python创建一个脚本,它允许我替换json文件中的字符串。此文件包含专利信息,例如:
{
"US-8163793-B2": {
"publication_date": "20120424",
"priority_date": "20090420",
"family_id": "42261969",
"country_code": "US",
"ipc_code": "C07D417/14",
"cpc_code": "C07D471/04",
"assignee_name": "Hoffman-La Roche Inc.",
"title": "Proline derivatives",
"abstract": "The invention relates to a compound of formula (I) wherein A, R 1 -R 6 are as defined in the description and in the claims. The compound of formula (I) can be used as a medicament."
}
然而,大约有15000个条目。为了规范化这个文档,在执行单词嵌入之前,我使用了一个软件,在找到的术语中包含标记。输出如下:
(此处原有标注软件的示例输出,抓取时丢失。)这个输出同样是一个 JSON 文件,可以当作字典使用。
我需要的是:每当某个被标注的术语(比如 "Roche")出现在专利文件中时,就把它替换成字典里对应的 "hitID"。
我对 Python 非常陌生,所以任何帮助或阅读材料推荐都会对我有很大帮助。
谢谢你!
编辑
以下是我目前已经写出的代码:
# NOTE(review): this fragment lost all indentation when pasted; the nesting below is
# implied by the logic, not shown. It is the body of a function (the enclosing `def`
# is not visible) that parses "termite"-annotated papers from a JSON file and builds
# one summary dict per paper. It relies on names defined elsewhere:
# mesh_tree_nr_to_id_dict, mesh_id_to_name_dict, name_to_mesh_id_dict,
# mesh_id_to_tree_nr_dict, entity_type_encoder, normalize_abstract,
# abstract_to_words, termite_dict_list — none are defined in this view.
# Load the whole file and decode as UTF-8 before parsing.
with open(file, "rb") as datafile:
json_data = json.loads(datafile.read().decode("utf-8")) # type: object
# One iteration per paper; assumes json_data is a list of dicts — TODO confirm
# (iterating a top-level JSON object would yield string keys instead).
for paper in json_data:
termite_dict = dict()
termite_dict_all_per_pmid = list()
pmid = int(paper["docID"])
abstract = paper["abstract"]
# Per-paper accumulators, one list per entity type.
gene_list = list()
indication_mesh_list = list()
drug_list = list()
mirna_list = list()
company_list = list()
bioproc_list = list()
protype_list = list()
if "termiteTags" in paper:
for termite_tag in paper["termiteTags"]:
type_entry = termite_tag["entityType"]
termite_dict = dict()
name = termite_tag["name"]
# "exact_string" is a comma-separated list of "sentence#from-to" loci,
# aligned by index with frag_vector_array below — presumably; verify.
exact_tag_locations = termite_tag["exact_string"].split(",")
relevant_tag_locations = list()
words_to_replace = list()
# process and store termite annotations
if type_entry == "GENE":
gene_list.append({"Gene": termite_tag["hitID"]})
elif type_entry == "INDICATION":
info = termite_tag["entityMeta"]
if "mesh_tree" in info:
# mesh_tree is a ";"-separated list of tree numbers; filter(None, ...)
# drops empty segments produced by trailing/doubled separators.
for e in list(filter(None, termite_tag["entityMeta"]["mesh_tree"].split(";"))):
try:
mesh_id = mesh_tree_nr_to_id_dict[e]
mesh_name = mesh_id_to_name_dict[mesh_id]
indication_mesh_list.append({"name": mesh_name, "id": mesh_id, "key": e})
except KeyError:
# Unknown tree number: skip silently.
continue
elif "_ext_uri" in info:
url = termite_tag["entityMeta"]["_ext_uri"]
try:
# Assumes the URI carries the MeSH id in a "term=" query param.
mesh_id = url.split("term=")[1]
mesh_name = mesh_id_to_name_dict[mesh_id]
mesh_tree_nr = name_to_mesh_id_dict[mesh_name]
indication_mesh_list.append({"name": mesh_name, "id": mesh_id, "key": mesh_tree_nr})
except KeyError:
print("Issue with Mesh key indication")
elif type_entry == "DRUG":
drug_list.append(termite_tag["name"])
elif type_entry == "MIRNA":
mirna_list.append(termite_tag["hitID"])
elif type_entry == "COMPANY":
company_list.append(termite_tag["name"])
elif type_entry == "BIOPROC":
bioproc_list.append(termite_tag["name"])
elif type_entry == "PROTYP":
protype_list.append(termite_tag["name"])
# store info for positions with words to normalize in abstract text
for hit_number, hit in enumerate(termite_tag["frag_vector_array"]):
hit = hit.replace("\n", " ")
# The matched surface form is wrapped either in {!...!} or {*...*}
# by the annotation tool; try the first marker, fall back to the second.
try:
match = re.match(r"^.*{!(.*)!}.*$", hit)
match_word = match.group(1)
except AttributeError:
try:
match = re.match(r"^.*{\*(.*)\*\}.*$", hit)
match_word = match.group(1)
except AttributeError:
# NOTE(review): bug hazard — if neither pattern matches, match_word
# is unbound (first iteration) or stale (later iterations), yet it
# is still used just below. Only the raw fragment is printed here.
print(hit)
# Only record hits whose surface form differs from the canonical name,
# i.e. the ones that actually need replacing in the abstract.
if match_word.lower() != name.lower():
exact_locus = exact_tag_locations[hit_number]
# A leading "-" marks a locus to ignore — presumably; confirm.
if not exact_locus.startswith("-"):
# sentence 0 is paper title
if not exact_locus.startswith("0"):
relevant_tag_locations.append(exact_tag_locations[hit_number])
words_to_replace.append(match_word)
# One replacement record per hit: canonical form, surface form,
# character span, and a numeric entity-type code for sorting.
termite_dict["norm"] = name
termite_dict["replace"] = match_word
fr, t = exact_locus.split("#")[1].split("-")
termite_dict["from"] = int(fr)
termite_dict["to"] = int(t)
termite_dict["len"] = int(t) - int(fr)
termite_dict["entityCode"] = entity_type_encoder[termite_tag["entityType"]]
termite_dict_all_per_pmid.append(termite_dict)
termite_dict = dict()
# abstract normalization and bag of words calculations
if len(termite_dict_all_per_pmid) > 0:
# Sort by position, then longest span first, then entity code, so that
# overlapping replacements are applied deterministically.
sorted_termite_dict_all_per_pmid = sorted(termite_dict_all_per_pmid,
key=lambda k: (k['from'], -k["len"], k["entityCode"]))
normalized_abstract = normalize_abstract(sorted_termite_dict_all_per_pmid, abstract)
termite_dict["Norm_Abstract"] = normalized_abstract
cleaned_abstract_text = abstract_to_words(normalized_abstract)
termite_dict["bag_of_words"] = list(set(cleaned_abstract_text))
termite_dict["docID"] = pmid
if "keywords" in paper:
keywords = [w.strip() for w in paper["keywords"].split(";")]
mesh_list = list()
for word in keywords:
# Title-case single lowercase words so they can hit the MeSH name index.
if len(word.split(" ")) == 1 and len(word) > 0 and word[0].islower():
word = word.title()
if word in name_to_mesh_id_dict:
mesh_id = name_to_mesh_id_dict[word]
try:
mesh_list.append([word, mesh_id, mesh_id_to_tree_nr_dict[mesh_id]])
except KeyError:
# No tree number known for this id: keep the entry with an empty slot.
mesh_list.append([word, mesh_id, ""])
termite_dict["MeshHeadings"] = mesh_list
# Attach the non-empty per-type accumulators only, keeping the output sparse.
if len(gene_list) > 0:
termite_dict["Genes"] = gene_list
if len(indication_mesh_list) > 0:
termite_dict["Indications"] = indication_mesh_list
if len(drug_list) > 0:
termite_dict["Drug"] = drug_list
if len(mirna_list) > 0:
termite_dict["MIRNA"] = mirna_list
if len(company_list) > 0:
termite_dict["Company"] = company_list
if len(bioproc_list) > 0:
termite_dict["Bioproc"] = bioproc_list
if len(protype_list) > 0:
termite_dict["Protyp"] = protype_list
# add meta list to be able to query for gene nd indication co-occurrence
meta_list = list()
if "Indications" in termite_dict:
meta_list.extend([indi["key"] for indi in termite_dict["Indications"]])
if "Genes" in termite_dict:
meta_list.extend([gene["Gene"] for gene in termite_dict["Genes"]])
if len(meta_list) > 0:
termite_dict["all_genes_indications"] = meta_list
termite_dict_list.append(termite_dict)
# termite_dict_list is accumulated at an outer scope not visible here.
return termite_dict_list
如果我正在跟踪您的目标,我认为您希望将您的专利数据中的
"assignee_name"
替换为来自贵公司数据的相应的"hitID"
,该数据基于包含在专利数据中的公司数据"name"
。在两个循环就可以了(不过我相信还有一种更优雅的方法)。当然,如果您需要更复杂的东西来确定来自公司数据的
"name"
是否真的与专利数据中的"assignee_name"
匹配,那么您可以在这种方法中加入一些正则表达式等手段;但上面的思路应该已经能让您朝正确的方向前进了。