插入Postgres时python代码速度慢

2024-04-29 17:15:06 发布

您现在位置:Python中文网/ 问答频道 /正文

我有以下使用PyVCF库的Python代码。这个脚本需要很长时间来解析我的VCF文件。你知道吗

以下是脚本:

# =============================================================================
# import libraries for python 
# =============================================================================
import vcf
import os.path
import psycopg2       
import datetime
import sys

# =============================================================================
# function drops the previous table if exists and creates new table with dynamic table name
# =============================================================================
def create_table(table_name,table_type,conn_obj):
    dbcur = conn_obj.cursor()
    dbcur.execute("""DROP TABLE IF EXISTS {0};""".format(table_name))
    if table_type == '1' :
        dbcur.execute("""CREATE TABLE IF NOT EXISTS {0} (as_ID INT, as_NM TEXT, as_at_ID INT, as_at_NM TEXT, VCF_ID TEXT, VARIANT_ID TEXT, as_at_LINE_SEQ INT, DATE_START DATE, DATE_END DATE, as_at_VAL_SEQ INT, as_at_VA_NM TEXT, as_at_VALUE TEXT);""".format(table_name))
    elif table_type == '2' :
        dbcur.execute("""CREATE TABLE IF NOT EXISTS {0} (as_at_VA_NM TEXT, as_at_VA_DESC TEXT,as_at_DATA_TYPE TEXT);""".format(table_name))     
    conn_obj.commit()
    message = 'Table '+table_name+' created successfully.\n'
    print(message)
    return table_name


# =============================================================================
# function for insert vcf files data in values table
# =============================================================================

def create_tuples(as_id, vcf_id, Variant_id, as_at_nm, sample_id, as_at_va_nm, as_at_value, date_start, date_end, as_va_seq, as_at_va_line_seq,data_table_name):
    as_at_id = '1'
    sample_id = '1'
    variant_id = '1'
    as_nm = 'VCF'
    datalist.append("({0},'{1}','{2}','{3}','{4}',{5},'{6}','{7}','{8}','{9}')".format(as_id,str(as_nm),as_at_id,as_at_nm,"",variant_id,as_at_va_line_seq,as_va_seq,as_at_va_nm,as_at_value))

    if len(datalist)==200000:
        insertdata(data_table_name,as_id)

def insertdata(data_table_name,as_id):
    global datalist
    global execution_start_time
    global index

    iter_datalist=iter(datalist) 
    args_str = ','.join(iter_datalist)
    cursor.execute('INSERT INTO '+data_table_name+' (as_ID,as_NM,as_at_ID,as_at_NM,VCF_ID,VARIANT_ID,as_at_LINE_SEQ,as_at_VAL_SEQ,as_at_VA_NM,as_at_VALUE) VALUES '+args_str) 
    conn.commit()
    datalist=[]
    time_now = datetime.datetime.now()
    time_difference = time_now - execution_start_time
    print('Total inserted records:'+str(index)+' in time '+str(time_difference))


# =============================================================================
# function defination / switch case for get records variable name & variable value
# =============================================================================
def get_header_data(header):
    switcher = {
        "CHROM" : {'variable_name':'CHROM', 'variable_value': record.CHROM},
        "POS" : {'variable_name':'POS', 'variable_value': record.POS},
        "ID" : {'variable_name':'ID', 'variable_value': record.ID},
        "REF" : {'variable_name':'REF', 'variable_value': record.REF},
        "ALT" : {'variable_name':'ALT', 'variable_value': record.ALT},
        "QUAL" : {'variable_name':'QUAL', 'variable_value': record.QUAL},
        "FILTER" : {'variable_name':'FILTER', 'variable_value': record.FILTER},
        "INFO" : {'variable_name':'INFO', 'variable_value': record.INFO},
        "FORMAT" : {'variable_name':'FORMAT', 'variable_value': record.FORMAT}
    } 
    return switcher.get(header, "Invalid header")   
dic={}
def insert_infos_metadata(reader_infos,line_index,file_name,as_at_nm,variable_data_name):
    for info_data in reader_infos:
        va_desc = reader_infos[info_data].desc
        arr_len = len(va_desc.split(':'))
        if arr_len > 1 :
            last_str = va_desc.split(':')[arr_len-1]
            dic[info_data]=last_str
    return line_index



# =============================================================================
# global variable declaration
# =============================================================================
index = 0
variableList = []
datalist  = []
headers_arr = []
totalrecordinserted=0
curentRecord=-1
execution_start_time= datetime.datetime.now()                                       # calculate execution start time

# =============================================================================
# get file path or name from user
# =============================================================================

file_path = "gnomad.vcf"
if os.path.isfile(file_path) :                                                      #Check file is exists or not on given path
    if True:
        conn = psycopg2.connect(host="localhost",database="mydb", user="postgres", password="pgAdmin")
        cursor = conn.cursor()
        data_table_name = "data"
        variable_data_name = "vdata"
        create_table("data","1",conn)
        create_table("vdata","2",conn)

        vcf_reader = vcf.Reader(open(file_path, 'r'))
        column_headers = vcf_reader._column_headers
        line_index = 0
        null_date = None
        as_at_header = 'Header'
        as_at_variant = 'Variant' 
        reader_infos = vcf_reader.infos
        line_index = insert_infos_metadata(reader_infos,line_index,file_path,as_at_header,variable_data_name)

        execution_start_time= datetime.datetime.now()  

        for record in vcf_reader:
            index += 1
            line_index += 1
            if index == 5000:
                break
            Variant_id = ''
            sample_name = ''
            #for header in column_headers : 
            for header in column_headers :
                if header != 'FORMAT' :
                    header_data_dict = get_header_data(header)
                    #print(header_data_dict)
                    header_data_dict_value = header_data_dict['variable_value']
                    #print(header_data_dict_value)
                    if isinstance(header_data_dict_value, str) :
                        create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, header, str(header_data_dict_value), null_date, null_date, 1, str(line_index),data_table_name)
                    elif isinstance(header_data_dict_value, list) :
                        if len(header_data_dict_value) == 0 and header == 'FILTER' :
                            create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, header, 'PASS', null_date, null_date, 1, str(line_index),data_table_name)
                        else :
                            i = 0
                            while i < len(header_data_dict_value) :
                                create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, header, str(header_data_dict_value[i]), null_date, null_date, str(i), str(line_index),data_table_name)
                                i += 1
                    elif isinstance(header_data_dict_value, dict) :
                        for dict_val_record in header_data_dict_value :
                            #print(dict_val_record,'.......',header_data_dict_value[dict_val_record])
                            variable_name = header + '/' + dict_val_record
                            variable_value = header_data_dict_value[dict_val_record]
                            #print('.........',variable_value)
                            variable_seq = 1 
                            if isinstance(variable_value,list) :
                                for value in variable_value :
                                    if dic.has_key(dict_val_record) :  
                                        header_key_description = vcf_reader.infos[dict_val_record].desc
                                        arr_len = len(header_key_description.split(':'))
                                        last_str = header_key_description.split(':')[arr_len-1]
                                        array_obj_desc_headers = last_str.split('|')
                                        arr_obj_val = value.strip("()[]").split("|")
                                        vararray = dic[dict_val_record].split("|")
                                        arr_index = 0
                                        for desc_header_value in arr_obj_val :
                                            variable_name = header+'/'+dict_val_record+'/'+vararray[arr_index].lstrip().replace("'","")
                                            variable_value = arr_obj_val[arr_index]
                                            arr_index += 1
                                            create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, variable_name, variable_value, null_date, null_date, str(variable_seq), str(line_index),data_table_name)
                                    else :
                                          create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, variable_name, value, null_date, null_date, str(variable_seq), str(line_index),data_table_name)

                                    variable_seq += 1  
                            else :
                                create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, variable_name, variable_value, null_date, null_date, str(variable_seq), str(line_index),data_table_name)
                    else :
                        variable_name = header
                        variable_value = str(header_data_dict_value)
                        create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, variable_name, variable_value, null_date, null_date, 1, str(line_index),data_table_name)
                else :
                    format_data_arr = (record.FORMAT).split(":")
                    for format_data_record in format_data_arr :
                        variable_name = header+'/'+format_data_record
                        variable_value = record.genotype(record_data.sample)[format_data_record]
                        create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, variable_name, variable_value, null_date, null_date, 1,str(line_index),data_table_name)

        insertdata(data_table_name,"1")
        print('Variants inserted successfully.')
    else:
        print('Incorrect Choice.')
else:
    print('File not present.')

下面是我们从gnomAD下载的输入VCF文件:

##fileformat=VCFv4.2
##hailversion=0.2.7-c860755b5da3
##INFO=<ID=vep,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|DISTANCE|STRAND|FLAGS|VARIANT_CLASS|MINIMISED|SYMBOL_SOURCE|HGNC_ID|CANONICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|GENE_PHENO|SIFT|PolyPhen|DOMAINS|HGVS_OFFSET|GMAF|AFR_MAF|AMR_MAF|EAS_MAF|EUR_MAF|SAS_MAF|AA_MAF|EA_MAF|ExAC_MAF|ExAC_Adj_MAF|ExAC_AFR_MAF|ExAC_AMR_MAF|ExAC_EAS_MAF|ExAC_FIN_MAF|ExAC_NFE_MAF|ExAC_OTH_MAF|ExAC_SAS_MAF|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|LoF|LoF_filter|LoF_flags|LoF_info">
#CHROM  POS ID  REF ALT QUAL    FILTER  INFO
22  16050036    rs374742143 A   C   442156.34   RF  AC=80;AN=180;AF=4.44444e-01;rf_tp_probability=1.98655e-01;FS=7.05600e+00;InbreedingCoeff=3.82000e-01;MQ=2.71500e+01;MQRankSum=-1.02600e+00;QD=2.47500e+01;ReadPosRankSum=-2.11000e-01;SOR=1.26750e+01;BaseQRankSum=7.36000e-01;ClippingRankSum=2.96000e-01;DP=50165;VQSLOD=-9.58600e+02;VQSR_culprit=MQ;segdup;rf_negative_label;rf_label=FP;rf_train;variant_type=snv;allele_type=snv;n_alt_alleles=1;pab_max=1.00000e+00;gq_hist_alt_bin_freq=15|1348|318|278|171|32|20|31|51|152|82|5|9|13|35|44|8|4|2|41;gq_hist_all_bin_freq=2396|2885|745|685|311|58|38|37|54|152|82|5|9|13|35|44|8|4|2|41;dp_hist_alt_bin_freq=1872|720|58|9|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=13607|1974|114|13|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_all_n_larger=0;ab_hist_alt_bin_freq=0|0|0|2|6|26|42|12|99|3|137|31|63|99|21|66|37|9|1|7;AC_nfe_seu=0;AN_nfe_seu=4;AF_nfe_seu=0.00000e+00;nhomalt_nfe_seu=0;controls_AC_afr_male=0;controls_AN_afr_male=4;controls_AF_afr_male=0.00000e+00;controls_nhomalt_afr_male=0;non_topmed_AC_amr=6;non_topmed_AN_amr=8;non_topmed_AF_amr=7.50000e-01;non_topmed_nhomalt_amr=2;AC_raw=4657;AN_raw=15208;AF_raw=3.06220e-01;nhomalt_raw=1998;AC_fin_female=27;AN_fin_female=44;AF_fin_female=6.13636e-01;nhomalt_fin_female=5;non_neuro_AC_asj_female=1;non_neuro_AN_asj_female=2;non_neuro_AF_asj_female=5.00000e-01;non_neuro_nhomalt_asj_female=0;non_neuro_AC_afr_male=1;non_neuro_AN_afr_male=8;non_neuro_AF_afr_male=1.25000e-01;non_neuro_nhomalt_afr_male=0;AC_afr_male=2;AN_afr_male=10;AF_afr_male=2.00000e-01;nhomalt_afr_male=0;AC_afr=2;AN_afr=14;AF_afr=1.42857e-01;nhomalt_afr=0;non_neuro_AC_afr_female=0;non_neuro_AN_afr_female=2;non_neuro_AF_afr_female=0.00000e+00;non_neuro_nhomalt_afr_female=0;non_topmed_AC_amr_female=3;non_topmed_AN_amr_female=4;non_topmed_AF_amr_female=7.50000e-01;non_topmed_nhomalt_amr_female=1;non_topmed_AC_oth_female=3;non_topmed_AN_oth_female=6;non_topmed_AF_oth_female=5.00000e-01;non_topmed_nhomalt_oth_female=0;AC_eas_female=0;AN_eas_female=0;nhomalt_eas_female=0;AC_afr_female=0;AN_afr_female=4;AF_afr_female=0.00000e+00;nhomalt_afr_female=0;non_neuro_AC_female=16;non_neuro_AN_female=44;non_neuro_AF_female=3.63636e-01;non_neuro_nhomalt_female=2;controls_AC_afr=0;controls_AN_afr=4;controls_AF_afr=0.00000e+00;controls_nhomalt_afr=0;AC_nfe_onf=2;AN_nfe_onf=12;AF_nfe_onf=1.66667e-01;nhomalt_nfe_onf=0;controls_AC_fin_male=3;controls_AN_fin_male=10;controls_AF_fin_male=3.00000e-01;controls_nhomalt_fin_male=1;non_neuro_AC_nfe_nwe=13;non_neuro_AN_nfe_nwe=36;non_neuro_AF_nfe_nwe=3.61111e-01;non_neuro_nhomalt_nfe_nwe=4;AC_fin_male=18;AN_fin_male=34;AF_fin_male=5.29412e-01;nhomalt_fin_male=5;AC_nfe_female=5;AN_nfe_female=28;AF_nfe_female=1.78571e-01;nhomalt_nfe_female=0;AC_amr=6;AN_amr=8;AF_amr=7.50000e-01;nhomalt_amr=2;non_topmed_AC_nfe_male=12;non_topmed_AN_nfe_male=28;non_topmed_AF_nfe_male=4.28571e-01;non_topmed_nhomalt_nfe_male=4;AC_eas=2;AN_eas=2;AF_eas=1.00000e+00;nhomalt_eas=1;nhomalt=18;non_neuro_AC_nfe_female=5;non_neuro_AN_nfe_female=24;non_neuro_AF_nfe_female=2.08333e-01;non_neuro_nhomalt_nfe_female=0;non_neuro_AC_afr=1;non_neuro_AN_afr=10;non_neuro_AF_afr=1.00000e-01;non_neuro_nhomalt_afr=0;controls_AC_raw=1798;controls_AN_raw=5254;controls_AF_raw=3.42215e-01;controls_nhomalt_raw=765;controls_AC_male=18;controls_AN_male=38;controls_AF_male=4.73684e-01;controls_nhomalt_male=6;non_topmed_AC_male=41;non_topmed_AN_male=86;non_topmed_AF_male=4.76744e-01;non_topmed_nhomalt_male=12;controls_AC_nfe_female=5;controls_AN_nfe_female=16;controls_AF_nfe_female=3.12500e-01;controls_nhomalt_nfe_female=0;non_neuro_AC_amr=3;non_neuro_AN_amr=4;non_neuro_AF_amr=7.50000e-01;non_neuro_nhomalt_amr=1;non_neuro_AC_eas_female=0;non_neuro_AN_eas_female=0;non_neuro_nhomalt_eas_female=0;AC_asj_male=0;AN_asj_male=0;nhomalt_asj_male=0;controls_AC_nfe_male=12;controls_AN_nfe_male=18;controls_AF_nfe_male=6.66667e-01;controls_nhomalt_nfe_male=4;non_neuro_AC_fin=8;non_neuro_AN_fin=18;non_neuro_AF_fin=4.44444e-01;non_neuro_nhomalt_fin=2;AC_oth_female=3;AN_oth_female=6;AF_oth_female=5.00000e-01;nhomalt_oth_female=0;controls_AC_nfe=17;controls_AN_nfe=34;controls_AF_nfe=5.00000e-01;controls_nhomalt_nfe=4;controls_AC_oth_female=2;controls_AN_oth_female=4;controls_AF_oth_female=5.00000e-01;controls_nhomalt_oth_female=0;controls_AC_asj=1;controls_AN_asj=2;controls_AF_asj=5.00000e-01;controls_nhomalt_asj=0;non_neuro_AC_amr_male=0;non_neuro_AN_amr_male=0;non_neuro_nhomalt_amr_male=0;controls_AC_nfe_nwe=13;controls_AN_nfe_nwe=22;controls_AF_nfe_nwe=5.90909e-01;controls_nhomalt_nfe_nwe=4;AC_nfe_nwe=13;AN_nfe_nwe=42;AF_nfe_nwe=3.09524e-01;nhomalt_nfe_nwe=4;controls_AC_nfe_seu=0;controls_AN_nfe_seu=0;controls_nhomalt_nfe_seu=0;non_neuro_AC_amr_female=3;non_neuro_AN_amr_female=4;non_neuro_AF_amr_female=7.50000e-01;non_neuro_nhomalt_amr_female=1;non_neuro_AC_nfe_onf=2;non_neuro_AN_nfe_onf=8;non_neuro_AF_nfe_onf=2.50000e-01;non_neuro_nhomalt_nfe_onf=0;non_topmed_AC_eas_male=2;non_topmed_AN_eas_male=2;non_topmed_AF_eas_male=1.00000e+00;non_topmed_nhomalt_eas_male=1;controls_AC_amr_female=3;controls_AN_amr_female=4;controls_AF_amr_female=7.50000e-01;controls_nhomalt_amr_female=1;non_neuro_AC_fin_male=3;non_neuro_AN_fin_male=10;non_neuro_AF_fin_male=3.00000e-01;non_neuro_nhomalt_fin_male=1;AC_female=39;AN_female=88;AF_female=4.43182e-01;nhomalt_female=6;non_neuro_AC_oth_male=1;non_neuro_AN_oth_male=4;non_neuro_AF_oth_male=2.50000e-01;non_neuro_nhomalt_oth_male=0;non_topmed_AC_nfe_est=2;non_topmed_AN_nfe_est=4;non_topmed_AF_nfe_est=5.00000e-01;non_topmed_nhomalt_nfe_est=0;non_topmed_AC_nfe_nwe=13;non_topmed_AN_nfe_nwe=34;non_topmed_AF_nfe_nwe=3.82353e-01;non_topmed_nhomalt_nfe_nwe=4;non_topmed_AC_amr_male=3;non_topmed_AN_amr_male=4;non_topmed_AF_amr_male=7.50000e-01;non_topmed_nhomalt_amr_male=1;non_topmed_AC_nfe_onf=2;non_topmed_AN_nfe_onf=12;non_topmed_AF_nfe_onf=1.66667e-01;non_topmed_nhomalt_nfe_onf=0;controls_AC_eas_male=2;controls_AN_eas_male=2;controls_AF_eas_male=1.00000e+00;controls_nhomalt_eas_male=1;controls_AC_oth_male=1;controls_AN_oth_male=4;controls_AF_oth_male=2.50000e-01;controls_nhomalt_oth_male=0;non_topmed_AC=80;non_topmed_AN=170;non_topmed_AF=4.70588e-01;non_topmed_nhomalt=18;controls_AC_fin=8;controls_AN_fin=18;controls_AF_fin=4.44444e-01;controls_nhomalt_fin=2;non_neuro_AC_nfe=17;non_neuro_AN_nfe=48;non_neuro_AF_nfe=3.54167e-01;non_neuro_nhomalt_nfe=4;non_neuro_AC_fin_female=5;non_neuro_AN_fin_female=8;non_neuro_AF_fin_female=6.25000e-01;non_neuro_nhomalt_fin_female=1;non_topmed_AC_nfe_seu=0;non_topmed_AN_nfe_seu=4;non_topmed_AF_nfe_seu=0.00000e+00;non_topmed_nhomalt_nfe_seu=0;controls_AC_eas_female=0;controls_AN_eas_female=0;controls_nhomalt_eas_female=0;non_topmed_AC_asj=1;non_topmed_AN_asj=2;non_topmed_AF_asj=5.00000e-01;non_topmed_nhomalt_asj=0;controls_AC_nfe_onf=2;controls_AN_nfe_onf=8;controls_AF_nfe_onf=2.50000e-01;controls_nhomalt_nfe_onf=0;non_neuro_AC=35;non_neuro_AN=92;non_neuro_AF=3.80435e-01;non_neuro_nhomalt=8;non_topmed_AC_nfe=17;non_topmed_AN_nfe=54;non_topmed_AF_nfe=3.14815e-01;non_topmed_nhomalt_nfe=4;non_topmed_AC_raw=3996;non_topmed_AN_raw=12728;non_topmed_AF_raw=3.13953e-01;non_topmed_nhomalt_raw=1721;non_neuro_AC_nfe_est=2;non_neuro_AN_nfe_est=4;non_neuro_AF_nfe_est=5.00000e-01;non_neuro_nhomalt_nfe_est=0;non_topmed_AC_oth_male=4;non_topmed_AN_oth_male=8;non_topmed_AF_oth_male=5.00000e-01;non_topmed_nhomalt_oth_male=1;AC_nfe_est=2;AN_nfe_est=4;AF_nfe_est=5.00000e-01;nhomalt_nfe_est=0;non_topmed_AC_afr_male=2;non_topmed_AN_afr_male=10;non_topmed_AF_afr_male=2.00000e-01;non_topmed_nhomalt_afr_male=0;AC_eas_male=2;AN_eas_male=2;AF_eas_male=1.00000e+00;nhomalt_eas_male=1;controls_AC_eas=2;controls_AN_eas=2;controls_AF_eas=1.00000e+00;controls_nhomalt_eas=1;non_neuro_AC_eas_male=2;non_neuro_AN_eas_male=2;non_neuro_AF_eas_male=1.00000e+00;non_neuro_nhomalt_eas_male=1;non_neuro_AC_asj_male=0;non_neuro_AN_asj_male=0;non_neuro_nhomalt_asj_male=0;controls_AC_oth=3;controls_AN_oth=8;controls_AF_oth=3.75000e-01;controls_nhomalt_oth=0;AC_nfe=17;AN_nfe=62;AF_nfe=2.74194e-01;nhomalt_nfe=4;non_topmed_AC_female=39;non_topmed_AN_female=84;non_topmed_AF_female=4.64286e-01;non_topmed_nhomalt_female=6;non_neuro_AC_asj=1;non_neuro_AN_asj=2;non_neuro_AF_asj=5.00000e-01;non_neuro_nhomalt_asj=0;non_topmed_AC_eas_female=0;non_topmed_AN_eas_female=0;non_topmed_nhomalt_eas_female=0;non_neuro_AC_raw=3103;non_neuro_AN_raw=10138;non_neuro_AF_raw=3.06076e-01;non_neuro_nhomalt_raw=1328;non_topmed_AC_eas=2;non_topmed_AN_eas=2;non_topmed_AF_eas=1.00000e+00;non_topmed_nhomalt_eas=1;non_topmed_AC_fin_male=18;non_topmed_AN_fin_male=34;non_topmed_AF_fin_male=5.29412e-01;non_topmed_nhomalt_fin_male=5;AC_fin=45;AN_fin=78;AF_fin=5.76923e-01;nhomalt_fin=10;AC_nfe_male=12;AN_nfe_male=34;AF_nfe_male=3.52941e-01;nhomalt_nfe_male=4;controls_AC_amr_male=0;controls_AN_amr_male=0;controls_nhomalt_amr_male=0;controls_AC_afr_female=0;controls_AN_afr_female=0;controls_nhomalt_afr_female=0;controls_AC_amr=3;controls_AN_amr=4;controls_AF_amr=7.50000e-01;controls_nhomalt_amr=1;AC_asj_female=1;AN_asj_female=2;AF_asj_female=5.00000e-01;nhomalt_asj_female=0;non_neuro_AC_eas=2;non_neuro_AN_eas=2;non_neuro_AF_eas=1.00000e+00;non_neuro_nhomalt_eas=1;non_neuro_AC_male=19;non_neuro_AN_male=48;non_neuro_AF_male=3.95833e-01;non_neuro_nhomalt_male=6;AC_asj=1;AN_asj=2;AF_asj=5.00000e-01;nhomalt_asj=0;controls_AC_nfe_est=2;controls_AN_nfe_est=4;controls_AF_nfe_est=5.00000e-01;controls_nhomalt_nfe_est=0;non_topmed_AC_asj_female=1;non_topmed_AN_asj_female=2;non_topmed_AF_asj_female=5.00000e-01;non_topmed_nhomalt_asj_female=0;non_topmed_AC_oth=7;non_topmed_AN_oth=14;non_topmed_AF_oth=5.00000e-01;non_topmed_nhomalt_oth=1;non_topmed_AC_fin_female=27;non_topmed_AN_fin_female=44;non_topmed_AF_fin_female=6.13636e-01;non_topmed_nhomalt_fin_female=5;AC_oth=7;AN_oth=14;AF_oth=5.00000e-01;nhomalt_oth=1;non_neuro_AC_nfe_male=12;non_neuro_AN_nfe_male=24;non_neuro_AF_nfe_male=5.00000e-01;non_neuro_nhomalt_nfe_male=4;controls_AC_female=16;controls_AN_female=34;controls_AF_female=4.70588e-01;controls_nhomalt_female=2;non_topmed_AC_fin=45;non_topmed_AN_fin=78;non_topmed_AF_fin=5.76923e-01;non_topmed_nhomalt_fin=10;non_topmed_AC_nfe_female=5;non_topmed_AN_nfe_female=26;non_topmed_AF_nfe_female=1.92308e-01;non_topmed_nhomalt_nfe_female=0;controls_AC_asj_male=0;controls_AN_asj_male=0;controls_nhomalt_asj_male=0;non_topmed_AC_asj_male=0;non_topmed_AN_asj_male=0;non_topmed_nhomalt_asj_male=0;non_neuro_AC_oth=3;non_neuro_AN_oth=8;non_neuro_AF_oth=3.75000e-01;non_neuro_nhomalt_oth=0;AC_male=41;AN_male=92;AF_male=4.45652e-01;nhomalt_male=12;controls_AC_fin_female=5;controls_AN_fin_female=8;controls_AF_fin_female=6.25000e-01;controls_nhomalt_fin_female=1;controls_AC_asj_female=1;controls_AN_asj_female=2;controls_AF_asj_female=5.00000e-01;controls_nhomalt_asj_female=0;AC_amr_male=3;AN_amr_male=4;AF_amr_male=7.50000e-01;nhomalt_amr_male=1;AC_amr_female=3;AN_amr_female=4;AF_amr_female=7.50000e-01;nhomalt_amr_female=1;AC_oth_male=4;AN_oth_male=8;AF_oth_male=5.00000e-01;nhomalt_oth_male=1;non_neuro_AC_nfe_seu=0;non_neuro_AN_nfe_seu=0;non_neuro_nhomalt_nfe_seu=0;non_topmed_AC_afr_female=0;non_topmed_AN_afr_female=2;non_topmed_AF_afr_female=0.00000e+00;non_topmed_nhomalt_afr_female=0;non_topmed_AC_afr=2;non_topmed_AN_afr=12;non_topmed_AF_afr=1.66667e-01;non_topmed_nhomalt_afr=0;controls_AC=34;controls_AN=72;controls_AF=4.72222e-01;controls_nhomalt=8;non_neuro_AC_oth_female=2;non_neuro_AN_oth_female=4;non_neuro_AF_oth_female=5.00000e-01;non_neuro_nhomalt_oth_female=0;non_topmed_faf95_amr=3.26626e-01;non_topmed_faf99_amr=3.26626e-01;faf95_afr=2.53825e-02;faf99_afr=2.53821e-02;controls_faf95_afr=0.00000e+00;controls_faf99_afr=0.00000e+00;faf95_amr=3.26626e-01;faf99_amr=3.26626e-01;faf95_eas=1.77680e-01;faf99_eas=1.77680e-01;faf95=3.65988e-01;faf99=3.65988e-01;non_neuro_faf95_afr=5.12900e-03;non_neuro_faf99_afr=5.12900e-03;non_neuro_faf95_amr=2.04422e-01;non_neuro_faf99_amr=2.04423e-01;controls_faf95_nfe=3.18592e-01;controls_faf99_nfe=3.18591e-01;non_topmed_faf95=3.87517e-01;non_topmed_faf99=3.87517e-01;non_neuro_faf95_nfe=2.25669e-01;non_neuro_faf99_nfe=2.25670e-01;non_neuro_faf95=2.81191e-01;non_neuro_faf99=2.81191e-01;non_topmed_faf95_nfe=2.00594e-01;non_topmed_faf99_nfe=2.00595e-01;controls_faf95_eas=1.77680e-01;controls_faf99_eas=1.77680e-01;faf95_nfe=1.74711e-01;faf99_nfe=1.74712e-01;non_topmed_faf95_eas=1.77680e-01;non_topmed_faf99_eas=1.77680e-01;controls_faf95_amr=2.04422e-01;controls_faf99_amr=2.04423e-01;non_neuro_faf95_eas=1.77680e-01;non_neuro_faf99_eas=1.77680e-01;non_topmed_faf95_afr=2.96130e-02;non_topmed_faf99_afr=2.96133e-02;controls_faf95=3.47362e-01;controls_faf99=3.47362e-01;controls_popmax=eas;controls_AC_popmax=2;controls_AN_popmax=2;controls_AF_popmax=1.00000e+00;controls_nhomalt_popmax=1;popmax=eas;AC_popmax=2;AN_popmax=2;AF_popmax=1.00000e+00;nhomalt_popmax=1;age_hist_het_bin_freq=2|1|1|1|0|2|0|2|1|0;age_hist_het_n_smaller=4;age_hist_het_n_larger=0;age_hist_hom_bin_freq=3|0|0|0|0|0|2|0|1|0;age_hist_hom_n_smaller=0;age_hist_hom_n_larger=0;non_neuro_popmax=eas;non_neuro_AC_popmax=2;non_neuro_AN_popmax=2;non_neuro_AF_popmax=1.00000e+00;non_neuro_nhomalt_popmax=1;non_topmed_popmax=eas;non_topmed_AC_popmax=2;non_topmed_AN_popmax=2;non_topmed_AF_popmax=1.00000e+00;non_topmed_nhomalt_popmax=1;vep=C|intergenic_variant|MODIFIER|||||||||||||||rs374742143|1||||SNV|1||||||||||||||||||||||||||||||||||||||||||||

我的输出在Postgres表中:

as_id   as_nm      as_at_id   as_at_nm      vcf_id  variant_id  as_at_line_seq  date_start  date_end    as_at_val_seq   as_at_va_nm                         as_at_value
1       VCF         1           Variant     ""      1           1                                       1               CHROM                           22
1       VCF         1           Variant     ""      1           1                                       1               POS                             16050036
1       VCF         1           Variant     ""      1           1                                       1               ID                              rs374742143
1       VCF         1           Variant     ""      1           1                                       1               REF                             A
1       VCF         1           Variant     ""      1           1                                       1               ALT                             C
1       VCF         1           Variant     ""      1           1                                       1               QUAL                            442156.34
1       VCF         1           Variant     ""      1           1                                       1               FILTER                          RF
1       VCF         1           Variant     ""      1           1                                       1               INFO/non_neuro_nhomalt_amr_male     0
1       VCF         1           Variant     ""      1           1                                       1               INFO/controls_AN_afr    4
1       VCF         1           Variant     ""      1           1                                       1               INFO/controls_AC_amr_female     3
1       VCF         1           Variant     ""      1           1                                       1               INFO/AF_oth_female      0.5
1       VCF         1           Variant     ""      1           1                                       1               INFO/non_topmed_nhomalt_afr_female  0

我的脚本大约需要60分钟来处理源VCF文件中的100K行。大部分时间都花在了博士后上。你知道吗

如何优化插入逻辑以加快速度?你知道吗


Tags: anasmalefemaleacneuroeasaf
2条回答

我知道在SQL中,插入/更新100K行或对大型未索引表执行事件查询需要一些时间。你知道吗

我对VCF不是很了解,但为了加快您的流程,请考虑以下几点:

  1. 索引表。你知道吗
  2. 执行几个insert命令,然后才执行commit

我想如果用特殊的postgresql方法切换到pandas-to-sql方法,速度可以提高10~100倍。 我用来比较普通的insert和postgresql的copy方法。速度是10~100倍取决于你的数据和表格设计。你知道吗

下面是示例代码。 1使用sqlalchemy db引擎如下:

from sqlalchmey import create_engine

engine = create_engine('postgres://postgres:postgres@yourserver/dbname')
  1. 为支持复制自的数据库定义\u sql()方法的替代方法。熊猫在线手册:https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#io-sql

    from io import StringIO
    
    def psql_insert_copy(table, conn, keys, data_iter):
    
        # gets a DBAPI connection that can provide a cursor
        dbapi_conn = conn.connection
        with dbapi_conn.cursor() as cur:
        s_buf = StringIO()
    
        writer = csv.writer(s_buf)
        writer.writerows(data_iter)
        s_buf.seek(0)
    
        columns = ', '.join('"{}"'.format(k) for k in keys)
        if table.schema:
            table_name = '{}.{}'.format(table.schema, table.name)
        else:
            table_name = table.name
    
        sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(
                              table_name, columns)
        cur.copy_expert(sql=sql, file=s_buf)
    
  2. 将数据转换为数据帧格式。 假设准备好的数据在df1中

  3. 通过sql将数据发送到数据库

    df1.tosql('yourtablename',engine=engin,method=psql_insert_copy) 
    

在我的例子中,在2核/16Gmem docker容器中至少可以达到每秒10~20K行的插入速度。你知道吗

祝你好运。 如果你发现答案有用,请投赞成票。你知道吗

相关问题 更多 >