优化大型.csv fi的搜索

2024-05-16 21:54:38 发布

男 | 程序猿一只，喜欢编程写python代码。

当一个csv文件被一个新的.m文件切割后，我将不允许用一个新的csv.m来写一个新的.m文件，然后用一个csv文件来切割它们。这很有魅力，但是。。。在

然后我需要把这些字符串中的每个字符串都提取出来，然后搜索另一个.csv文件（大约有725000个条目），看看这些字符串是否列在大文件中。如果是，则将它们写入一个单独的文件中。我已经成功地做到了这一点（见下面的代码），但它是超级慢。。。我将大文件从72.5万条减少到大约2000条，这花了15秒（意味着整个文件大约90分钟）。这太慢了！如何减少计算时间？在

import csv
import re
import time

# Input should be a .csv file with 2 columns (PrEST ID, PrEST Sequence)
in_file = open('Tryptic Sequences Input.csv','r')
in_file1 = open('Reference Peptides (ENSG, Björn) TEST.csv','r')
out_file = open('Tryptic Sequences Output.csv','w+')
out_file1 = open('Tryptic Sequences Output (non-unique peptides).csv','w+')

# Reader/Writer iterables
reader = csv.reader(in_file)
reader1 = csv.reader(in_file1)
in_list = list(reader)
in_list1 = list(reader1)
writer = csv.writer(out_file)
writer1 = csv.writer(out_file1)
headers = ('PrEST','Peptide')
writer.writerow(headers)
writer1.writerow(headers)

# Initiate variables
Peptide_list = [] # List for Peptides (resets for each PrEST)
ID_list = [] # List for PrEST IDs (resets for each PrEST)
Copy_list = [] # List for non-unique tryptic peptides
Copy_ID_list = []
Peptide = '' # Current peptide (no missed cleavages)
Peptide_MC1 = '' # Current peptide with 1 missed cleavage
Peptide_MC2 = '' # Current peptide with 2 missed cleavages
MC1 = 'N'
MC2 = 'N'
Unique = 'Y'

t0 = time.clock()

# ------ Main PrEST for-loop -------
for row in range(len(in_list)): # For every PrEST (row)
    First = 'Y'
    PrEST_seq = in_list[row][1]

    # -------- Main AA-reader for-loop --------
    for n in range(len(PrEST_seq)): # For every AA in every PrEST

        if ((PrEST_seq[n:n+1] == 'R' or
             PrEST_seq[n:n+1] == 'K') and
             PrEST_seq[n+1:n+2] != 'P'):
            if First != 'Y': # Does not count first peptide + MCs (part of ABP)
                Peptide += PrEST_seq[n:n+1]
                if len(Peptide) >= 6: # Only appends peptide if longer than 6 AA

 # KEY PART ---------------------------------------------------------------------

                    # Searches for non-unique peptides from in_file1
                    for line in range(len(in_list1)):
                        if re.search(r"\b(?=\w)%s\b(?!\w)" % Peptide,in_list1[line][2]) != None:
                            Unique = 'N'
                            Copy_ID_list.append(in_list[row][0])
                            Copy_list.append(Peptide)
                            break
                    if Unique == 'Y':
                        ID_list.append(in_list[row][0])
                        Peptide_list.append(Peptide)

 # (repeated twice below) --------------------------------------------------------

                Unique = 'Y' # Resets variable

                # -------- One missed cleavage while-loop --------
                Peptide_MC1 = Peptide
                m = n
                while MC1 == 'N' and m+1 <= len(PrEST_seq):
                    m += 1
                    if ((PrEST_seq[m:m+1] == 'R' or
                         PrEST_seq[m:m+1] == 'K') and
                         PrEST_seq[m+1:m+2] != 'P'):
                        Peptide_MC1 += PrEST_seq[m:m+1]
                        if len(Peptide_MC1) >= 6:

                            for line in range(len(in_list1)):
                                if re.search(r"\b(?=\w)%s\b(?!\w)" % Peptide_MC1,in_list1[line][2]) != None:
                                    Unique = 'N'
                                    Copy_ID_list.append(in_list[row][0])
                                    Copy_list.append(Peptide_MC1)
                                    break
                            if Unique == 'Y':
                                ID_list.append(in_list[row][0])
                                Peptide_list.append(Peptide_MC1)
                        Unique = 'Y'
                        MC1 = 'Y'
                    else:
                        Peptide_MC1 += PrEST_seq[m:m+1]
                    # ------------- End MC1 while-loop ------------

                # -------- Two missed cleavages while-loop --------
                Peptide_MC2 = Peptide_MC1
                k = m
                while MC2 == 'N' and k+1 <= len(PrEST_seq):
                    k += 1
                    if ((PrEST_seq[k:k+1] == 'R' or
                         PrEST_seq[k:k+1] == 'K') and
                         PrEST_seq[k+1:k+2] != 'P'):
                        Peptide_MC2 += PrEST_seq[k:k+1]
                        if len(Peptide_MC2) >= 6:

                            for line in range(len(in_list1)):
                                if re.search(r"\b(?=\w)%s\b(?!\w)" % Peptide_MC2,in_list1[line][2]) != None:
                                    Unique = 'N'
                                    Copy_ID_list.append(in_list[row][0])
                                    Copy_list.append(Peptide_MC2)
                                    break
                            if Unique == 'Y':
                                ID_list.append(in_list[row][0])
                                Peptide_list.append(Peptide_MC2)
                        Unique = 'Y'
                        MC2 = 'Y'
                    else:
                        Peptide_MC2 += PrEST_seq[k:k+1]
                    # ------------ End MC2 while-loop -------------

                # Resets variables
                Peptide = ''
                Peptide_MC1 = ''
                Peptide_MC2 = ''
                MC1 = 'N'
                MC2 = 'N'
            elif First == 'Y': # Doesn't count first cleavage (contains ABP)
                Peptide = ''
                First = 'N'
        else: # Non-cleavable AAs - Peptide grows
            Peptide += PrEST_seq[n:n+1]

        # ------- End main AA-reader for-loop --------

    Peptide_list.sort(key=len, reverse=True) # Sorts list by length
    for j in range(len(Peptide_list)): # Writes current PrEST to file
        Collected_list = (ID_list[j],Peptide_list[j])
        writer.writerow(Collected_list)
    Peptide_list = []
    ID_list = []

    Copy_list.sort(key=len, reverse=True)
    for j in range(len(Copy_list)):
        Collected_list = (Copy_ID_list[j],Copy_list[j])
        writer1.writerow(Collected_list)
    Copy_list = []
    Copy_ID_list = []

    # ----- End main PrEST-reader for-loop -----      
print('------- Finished -------')
print('Total time',time.clock()-t0,'seconds')
out_file.close()
out_file1.close()

一般来说，我对Python和编程比较陌生，而且我非常确定我的代码在很多方面都缺乏。如果我不包括对大的.csv文件的搜索，这是相当快的，但我有点需要这部分。我不知道它是否可以在搜索部分更快，或者在其他地方也可以更快。在

Tags：文件 csv in id for len if seq

0条回答

目前没有回答

优化大型.csv fi的搜索

相关问题更多 >

编程相关推荐

热门问题

热门文章

优化大型.csv fi的搜索

相关问题 更多 >

编程相关推荐

热门问题

热门文章

相关问题更多 >