优化大型.csv fi的搜索

2024-05-16 21:54:38 发布

您现在位置:Python中文网/ 问答频道 /正文

当一个csv文件被一个新的.m文件切割后,我将不允许用一个新的csv.m来写一个新的.m文件,然后用一个csv文件来切割它们。这很有魅力,但是。。。在

然后我需要把这些字符串中的每个字符串都提取出来,然后搜索另一个.csv文件(大约有725000个条目),看看这些字符串是否列在大文件中。如果是,则将它们写入一个单独的文件中。我已经成功地做到了这一点(见下面的代码),但它是超级慢。。。我将大文件从72.5万条减少到大约2000条,这花了15秒(意味着整个文件大约90分钟)。这太慢了!如何减少计算时间?在

import csv
import re
import time

# Input should be a .csv file with 2 columns (PrEST ID, PrEST Sequence)
in_file = open('Tryptic Sequences Input.csv','r')
in_file1 = open('Reference Peptides (ENSG, Björn) TEST.csv','r')
out_file = open('Tryptic Sequences Output.csv','w+')
out_file1 = open('Tryptic Sequences Output (non-unique peptides).csv','w+')

# Reader/Writer iterables
reader = csv.reader(in_file)
reader1 = csv.reader(in_file1)
in_list = list(reader)
in_list1 = list(reader1)
writer = csv.writer(out_file)
writer1 = csv.writer(out_file1)
headers = ('PrEST','Peptide')
writer.writerow(headers)
writer1.writerow(headers)

# Initiate variables
Peptide_list = [] # List for Peptides (resets for each PrEST)
ID_list = [] # List for PrEST IDs (resets for each PrEST)
Copy_list = [] # List for non-unique tryptic peptides
Copy_ID_list = []
Peptide = '' # Current peptide (no missed cleavages)
Peptide_MC1 = '' # Current peptide with 1 missed cleavage
Peptide_MC2 = '' # Current peptide with 2 missed cleavages
MC1 = 'N'
MC2 = 'N'
Unique = 'Y'

t0 = time.clock()

# ------ Main PrEST for-loop -------
for row in range(len(in_list)): # For every PrEST (row)
    First = 'Y'
    PrEST_seq = in_list[row][1]

    # -------- Main AA-reader for-loop --------
    for n in range(len(PrEST_seq)): # For every AA in every PrEST

        if ((PrEST_seq[n:n+1] == 'R' or
             PrEST_seq[n:n+1] == 'K') and
             PrEST_seq[n+1:n+2] != 'P'):
            if First != 'Y': # Does not count first peptide + MCs (part of ABP)
                Peptide += PrEST_seq[n:n+1]
                if len(Peptide) >= 6: # Only appends peptide if longer than 6 AA

 # KEY PART ---------------------------------------------------------------------

                    # Searches for non-unique peptides from in_file1
                    for line in range(len(in_list1)):
                        if re.search(r"\b(?=\w)%s\b(?!\w)" % Peptide,in_list1[line][2]) != None:
                            Unique = 'N'
                            Copy_ID_list.append(in_list[row][0])
                            Copy_list.append(Peptide)
                            break
                    if Unique == 'Y':
                        ID_list.append(in_list[row][0])
                        Peptide_list.append(Peptide)

 # (repeated twice below) --------------------------------------------------------

                Unique = 'Y' # Resets variable

                # -------- One missed cleavage while-loop --------
                Peptide_MC1 = Peptide
                m = n
                while MC1 == 'N' and m+1 <= len(PrEST_seq):
                    m += 1
                    if ((PrEST_seq[m:m+1] == 'R' or
                         PrEST_seq[m:m+1] == 'K') and
                         PrEST_seq[m+1:m+2] != 'P'):
                        Peptide_MC1 += PrEST_seq[m:m+1]
                        if len(Peptide_MC1) >= 6:

                            for line in range(len(in_list1)):
                                if re.search(r"\b(?=\w)%s\b(?!\w)" % Peptide_MC1,in_list1[line][2]) != None:
                                    Unique = 'N'
                                    Copy_ID_list.append(in_list[row][0])
                                    Copy_list.append(Peptide_MC1)
                                    break
                            if Unique == 'Y':
                                ID_list.append(in_list[row][0])
                                Peptide_list.append(Peptide_MC1)
                        Unique = 'Y'
                        MC1 = 'Y'
                    else:
                        Peptide_MC1 += PrEST_seq[m:m+1]
                    # ------------- End MC1 while-loop ------------

                # -------- Two missed cleavages while-loop --------
                Peptide_MC2 = Peptide_MC1
                k = m
                while MC2 == 'N' and k+1 <= len(PrEST_seq):
                    k += 1
                    if ((PrEST_seq[k:k+1] == 'R' or
                         PrEST_seq[k:k+1] == 'K') and
                         PrEST_seq[k+1:k+2] != 'P'):
                        Peptide_MC2 += PrEST_seq[k:k+1]
                        if len(Peptide_MC2) >= 6:

                            for line in range(len(in_list1)):
                                if re.search(r"\b(?=\w)%s\b(?!\w)" % Peptide_MC2,in_list1[line][2]) != None:
                                    Unique = 'N'
                                    Copy_ID_list.append(in_list[row][0])
                                    Copy_list.append(Peptide_MC2)
                                    break
                            if Unique == 'Y':
                                ID_list.append(in_list[row][0])
                                Peptide_list.append(Peptide_MC2)
                        Unique = 'Y'
                        MC2 = 'Y'
                    else:
                        Peptide_MC2 += PrEST_seq[k:k+1]
                    # ------------ End MC2 while-loop -------------

                # Resets variables
                Peptide = ''
                Peptide_MC1 = ''
                Peptide_MC2 = ''
                MC1 = 'N'
                MC2 = 'N'
            elif First == 'Y': # Doesn't count first cleavage (contains ABP)
                Peptide = ''
                First = 'N'
        else: # Non-cleavable AAs - Peptide grows
            Peptide += PrEST_seq[n:n+1]

        # ------- End main AA-reader for-loop --------

    Peptide_list.sort(key=len, reverse=True) # Sorts list by length
    for j in range(len(Peptide_list)): # Writes current PrEST to file
        Collected_list = (ID_list[j],Peptide_list[j])
        writer.writerow(Collected_list)
    Peptide_list = []
    ID_list = []

    Copy_list.sort(key=len, reverse=True)
    for j in range(len(Copy_list)):
        Collected_list = (Copy_ID_list[j],Copy_list[j])
        writer1.writerow(Collected_list)
    Copy_list = []
    Copy_ID_list = []

    # ----- End main PrEST-reader for-loop -----      
print('------- Finished -------')
print('Total time',time.clock()-t0,'seconds')
out_file.close()
out_file1.close()

一般来说,我对Python和编程比较陌生,而且我非常确定我的代码在很多方面都缺乏。如果我不包括对大的.csv文件的搜索,这是相当快的,但我有点需要这部分。我不知道它是否可以在搜索部分更快,或者在其他地方也可以更快。在


Tags: 文件csvinidforlenifseq