匹配文本和替换的更好方法

GENE1_ID GENE2_ID SNP1_ID SNP2_ID Drug
TRIML1 D4S234E rs1 rs2 xyz
TRIML2 D4S234E rs1 rs2 xyz
TRIML1 rs8 rs1 rs8 abc
TRIML2 rs8 rs1 rs8 abc
D4S234E ACCN5 rs2 rs4 xyz
D4S234E CTSO rs2 rs4 xyz
D4S234E ODZ3 rs2 rs5 abc1
ODZ3 rs7 rs5 rs7 abc2
TRIML1 ODZ3 rs6 rs5 xyz1

# Expand every SNP pair listed in File2 into all gene-gene combinations,
# using the SNP -> genes mapping read from File1. An SNP that has no mapping
# is written through unchanged.

# Each File1 line is split on whitespace: field 0 is the SNP id, field 1 is a
# comma-separated list of gene ids.
snp_gene_dict = {}
with open('File1') as f1:
    for line in f1:
        fields = line.split()  # split once instead of once per field
        snp_gene_dict[fields[0]] = fields[1].split(',')

# Each File2 line must hold exactly two whitespace-separated SNP ids.
col0 = []
col1 = []
with open('File2') as f2:
    for line in f2:
        snp0, snp1 = line.split()
        col0.append(snp0)
        col1.append(snp1)

# Replace each SNP by its gene list; fall back to the SNP itself when unmapped.
snp_first_col = [snp_gene_dict.get(snp, [snp]) for snp in col0]
snp_second_col = [snp_gene_dict.get(snp, [snp]) for snp in col1]

with open('output-gene-gene', 'w') as out:
    # Both lists have one entry per File2 line, so zip() pairs them exactly as
    # the Python 2-only map(None, ...) did, and it also works on Python 3.
    for first_genes, second_genes in zip(snp_first_col, snp_second_col):
        # Every gene list has at least one element, so a plain nested loop
        # covers the 1x1, Nx1, 1xN and NxM cases with identical output.
        for first_gene in first_genes:
            for second_gene in second_genes:
                out.write('{a}\t{b} \n'.format(a=first_gene, b=second_gene))

1条回答

网友

1楼 · 发布于 2024-06-02 09:07:42

这里有一种使用 SQLite 的方法，思路非常简单：先把 File1 的内容导入数据库，然后在处理 File2 时从数据库中查询。

import logging
INSERT_SPN_STATEMENT = 'INSERT INTO spn_table (spn_id, gene_id) VALUES (?, ?)'
SELECT_SPN_BY_ID_STATEMENT = 'SELECT ID FROM spn_table WHERE spn_id=? and GENE_ID=?'


def dump_file_to_db(File1, connection):
    """Load SNP -> gene mappings from *File1* into ``spn_table``.

    Each non-blank line must hold two whitespace-separated fields: an SNP id
    and a comma-separated list of gene ids (e.g. ``rs1 TRIML1,TRIML2``). One
    row is inserted per (SNP, gene) pair. Pairs already present in the table
    are skipped, so loading the same data twice does not create duplicates.
    Blank lines and empty gene names (e.g. from a trailing comma) are ignored.
    All inserts are committed once, after the whole input has been read.

    Args:
        File1: Iterable of text lines, such as an open file.
        connection: Database connection providing ``cursor()`` and
            ``commit()``; ``spn_table`` must already exist.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields.
    """
    cursor = connection.cursor()
    for line in File1:
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines, e.g. at the end of the file
        spn_id, gene_ids = fields
        for gene_id in gene_ids.split(','):
            if not gene_id:
                continue  # "A,B," would otherwise store an empty gene id
            cursor.execute(SELECT_SPN_BY_ID_STATEMENT, (spn_id, gene_id))
            if cursor.fetchone():
                continue  # record exists
            cursor.execute(INSERT_SPN_STATEMENT, (spn_id, gene_id))
    connection.commit()

SELECT_SPN_STATEMENT = 'SELECT ID, spn_id, gene_id FROM spn_table WHERE spn_id=?'


def _genes_for_spn(cursor, spn_id):
    """Return every gene id stored for *spn_id*, or ``[spn_id]`` if none."""
    cursor.execute(SELECT_SPN_STATEMENT, (spn_id,))
    genes = [gene_id for _row_id, _spn_id, gene_id in cursor.fetchall()]
    # An SNP with no mapping stands in for itself in the gene column.
    return genes or [spn_id]


def read_file(File2, connection):
    """Log every gene-gene combination for the SNP pairs in *File2*.

    Each non-blank line must hold three whitespace-separated fields: two SNP
    ids and a drug name. Both SNPs are looked up in ``spn_table`` and one INFO
    message ``gene1 gene2 snp1 snp2 drug`` is logged per combination of their
    genes. An SNP without any row in the table is reported under its own id
    instead of raising. Blank lines are ignored.

    Args:
        File2: Iterable of text lines, such as an open file.
        connection: Database connection providing ``cursor()``;
            ``spn_table`` must already exist.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields.
    """
    cursor = connection.cursor()
    for line in File2:
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines, e.g. at the end of the file
        spn1, spn2, drug = fields
        genes1 = _genes_for_spn(cursor, spn1)
        genes2 = _genes_for_spn(cursor, spn2)
        for gene1 in genes1:
            for gene2 in genes2:
                logging.info("%s %s %s %s %s", gene1, gene2, spn1, spn2, drug)

def initialize_db(db_path='test.db'):
    """Open the SQLite database and make sure ``spn_table`` exists.

    Args:
        db_path: Database location passed to ``sqlite3.connect``. Defaults to
            ``'test.db'``; pass ``':memory:'`` for a throwaway database.

    Returns:
        The open sqlite3 connection. The caller is responsible for closing it.
    """
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    # IF NOT EXISTS keeps repeated runs against the same database harmless.
    cursor.execute('''CREATE TABLE IF NOT EXISTS spn_table
             (Id INTEGER PRIMARY KEY, spn_id text, gene_id text)''')
    return conn

import sqlite3
# Script driver: load File1.txt into the database, then resolve the SNP pairs
# listed in File2.txt against it.
connection = initialize_db()
logging.basicConfig(level=logging.DEBUG)
logging.info("Started")
try:
    with open('File1.txt') as File1:
        dump_file_to_db(File1, connection)
    with open('File2.txt') as File2:
        read_file(File2, connection)
finally:
    # Release the SQLite handle even if a file is missing or malformed.
    connection.close()
logging.info("Done")

相关问题更多 >

编程相关推荐

热门问题

热门文章