通过python（Py2neo）将大型数据集转录到Neo4j中

import pandas as pd import csv import math import allel import zarr from py2neo import Graph, Node, Relationship, NodeMatcher zarr_path = '.../chroms.zarr' callset = zarr.open_group(zarr_path, mode='r') samples = callset[chrom]['samples'] graph = Graph(user="neo4j", password="password") chrom_list = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,'X'] for chrom in chrom_list: variants = allel.VariantChunkedTable(callset[chrom]['variants'], names=['AC','AF_AFR', 'AF_AMR', 'AF_ASN', 'AF_EUR', 'AF_MAX', 'CGT', 'CLR', 'CSQ', 'DP', 'DP4', 'ESP_MAF', 'FILTER_LowQual', 'FILTER_MinHWE', 'FILTER_MinVQSLOD', 'FILTER_PASS', 'HWE', 'ICF', 'ID', 'IS', 'PC2', 'PCHI2', 'POS', 'PR', 'QCHI2', 'QUAL', 'REF', 'ALT', 'INDEL', 'SHAPEIT', 'SNP_ID', 'TYPE', 'UGT', 'VQSLOD', 'dbSNPmismatch', 'is_snp', 'numalt'], index='POS') pos = variants['POS'][:] SNPid = variants['ID'][:] ref = variants['REF'][:] alt = variants['ALT'][:] dp = variants['DP'][:] ac = variants['AC'][:] vartype = variants['TYPE'][:] qual = variants['QUAL'][:] vq = variants['VQSLOD'][:] numalt = variants['numalt'][:] csq = variants['CSQ'][:] vcfv = 'VCFv4.1' refv = 'file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta' dpz = callset[chrom]['calldata/DP'] psz = callset[chrom]['calldata/PS'] plz = callset[chrom]['calldata/PL'] gpz = callset[chrom]['calldata/GP'] calldata = callset[chrom]['calldata'] gt = allel.GenotypeDaskArray(calldata['GT']) hap = gt.to_haplotypes() hap1 = hap[:, ::2] hap2 = hap[:, 1::2] i = 0 for i in range(len(samples)): subject = samples[i] subject_node = matcher.match("Subject", subject_id= subject) if subject_node.first() is None: continue seq_tech = 'Illumina HiSeq 2000 (ILLUMINA)' dp = dpz[:, i] ps = psz[:, i] pl = plz[:, i] gp = gpz[:, i] list_h1 = hap1[:, i].compute() list_h2 = hap2[:, i].compute() chrom_label = "Chromosome_" + str(chrom) j = 0 for j in range(len(pos)): h1 = int(list_h1[j]) h2 = int(list_h2[j]) read_depth = int(dp[j]) ps1 = int(ps[j]) PL0 = int(pl[j][0]) PL1 = int(pl[j][1]) PL2 = int(pl[j][2]) genotype = str(h1) + '|' + str(h2) GP0 = float(gp[j][0]) GP1 = float(gp[j][1]) GP2 = float(gp[j][2]) k = int(pos[j]) l = str(ref[j]) m = str(alt[j][h1-1]) o = str(alt[j][h2-1]) if h1 == 0 and h2 == 0: a1 = matcher.match(chrom_label, "Allele", pos= k, bp = l) r2 = Relationship(subject_node.first(), "Homozygous", a1.first(), HTA=h1, HTB=h2, GT=genotype, seq_tech=seq_tech, dp=read_depth, phase_set=ps1, PL0=PL0, PL1=PL1, PL2=PL2, GP0=GP0, GP1=GP1, GP2=GP2) graph.create(r2) elif h1 == 0 and h2 > 0: a1 = matcher.match(chrom_label, "Allele", pos= k, bp = l) r2 = Relationship(subject_node.first(), "Heterozygous", a1.first(), HTA=h1, GT=genotype, seq_tech=seq_tech, dp=read_depth, phase_set=ps1, PL0=PL0, PL1=PL1, PL2=PL2, GP0=GP0, GP1=GP1, GP2=GP2) graph.create(r2) a2 = matcher.match(chrom_label, "Allele", pos= k, bp = o) r3 = Relationship(subject_node.first(), "Heterozygous", a2.first(), HTB=h2, GT=genotype, seq_tech=seq_tech, dp=read_depth, phase_set=ps1, PL0=PL0, PL1=PL1, PL2=PL2, GP0=GP0, GP1=GP1, GP2=GP2) graph.create(r3) elif h1 > 0 and h2 == 0: a1 = matcher.match(chrom_label, "Allele", pos= k, bp = m) r2 = Relationship(subject_node.first(), "Heterozygous", a1.first(), HTA=h1, GT=genotype, seq_tech=seq_tech, dp=read_depth, phase_set=ps1, PL0=PL0, PL1=PL1, PL2=PL2, GP0=GP0, GP1=GP1, GP2=GP2) graph.create(r2) a2 = matcher.match(chrom_label, "Allele", pos= k, bp = l) r3 = Relationship(subject_node.first(), "Heterozygous", a2.first(), HTB=h2, GT=genotype, seq_tech=seq_tech, dp=read_depth, phase_set=ps1, PL0=PL0, PL1=PL1, PL2=PL2, GP0=GP0, GP1=GP1, GP2=GP2) graph.create(r3) elif h1 == h2 and h1 > 0: a1 = matcher.match(chrom_label, "Allele", pos= k, bp = m) r2 = Relationship(subject_node.first(), "Homozygous", a1.first(), HTA = h1, HTB = h2, GT=genotype, seq_tech=seq_tech, dp=read_depth, phase_set=ps1, PL0=PL0, PL1=PL1, PL2=PL2, GP0=GP0, GP1=GP1, GP2=GP2) graph.create(r2) else: a1 = matcher.match(chrom_label, "Allele", pos= k, bp = m) r2 = Relationship(subject_node.first(), "Heterozygous", a1.first(), HTA=h1, GT=genotype, seq_tech=seq_tech, dp=read_depth, phase_set=ps1, PL0=PL0, PL1=PL1, PL2=PL2, GP0=GP0, GP1=GP1, GP2=GP2) graph.create(r2) a2 = matcher.match(chrom_label, "Allele", pos= k, bp = o) r3 = Relationship(subject_node.first(), "Heterozygous", a2.first(), HTB=h2, GT=genotype, seq_tech=seq_tech, dp=read_depth, phase_set=ps1, PL0=PL0, PL1=PL1, PL2=PL2, GP0=GP0, GP1=GP1, GP2=GP2) graph.create(r3) print("Subject " + subject + " completed.") print(chrom_label + "completed.")

1条回答

网友

1楼 · 发布于 2024-05-15 12:30:04

单独创建大量元素总是很慢，这主要是因为需要的网络跳数。您还将在每一次之间进行匹配，这将进一步增加时间

解决这类问题的最佳方法是查看批处理，包括读取和写入。虽然您也不能同时完成所有操作，但一次将操作批处理成至少几百个操作将产生显著的效果。在您的情况下，您可能需要执行大容量读取，然后执行大容量写入，等等

因此，具体地说，可以针对多个实体进行匹配（您可以使用“in”修饰符，或者您可能需要使用原始密码）。对于写操作，使用相关节点和关系在本地构建一个子图，并在单个调用中创建该子图

您的最佳批量大小只能通过实验来发现，所以您可能不会第一次就得到正确的结果。但批处理无疑是这里的关键

相关问题更多 >

编程相关推荐

热门问题

热门文章