使用Python添加代码以提取已找到的开放阅读框(ORF)序列

2024-04-25 05:15:08 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在处理一个包含多个FASTA条目的文件。我有一个Python脚本,可以找出所有开放阅读框(ORF):它排除小于30bp的ORF,并给出起始/终止密码子的碱基位置。现在我需要修改程序,使其能够提取(或追加)每个ORF对应的具体序列,以便把这些ORF序列写入一个新的FASTA文件。

    #FUNCTION START
def orfFINDER(dna,frame):
    """Find open reading frames (ORFs) in one reading frame of a sequence.

    The sequence is read codon by codon (case-insensitively), starting at
    0-based offset ``frame`` and stepping three bases at a time.  For each
    stop codon (``tga``, ``tag``, ``taa``) the first ``atg`` that lies after
    the previously reported stop and before this stop opens an ORF; later
    ``atg`` codons inside the same ORF are ignored.

    Args:
        dna: Nucleotide sequence as a string (any letter case).
        frame: 0-based offset of the first codon to read.

    Returns:
        A dict mapping ``"orf1"``, ``"orf2"``, ... (numbered in order of
        their stop codon) to a tuple ``(start, stop, size, frame)`` where

        * ``start`` is the 1-based position of the first base of the start
          codon, as a decimal string (kept as a string so existing callers
          see the same values as before),
        * ``stop`` is the 1-based position of the last base of the stop
          codon, as an int,
        * ``size`` is ``stop - start + 1`` (the stop codon is included),
        * ``frame`` is the ``frame`` argument, unchanged.

        The dict is empty when no start/stop pair is found.
    """
    stop_codons = ('tga', 'tag', 'taa')
    start_codon = 'atg'

    # 1-based coordinates: first base of each start codon, last base of each
    # stop codon.  Both lists are ascending because the scan runs left to right.
    start_positions = []
    stop_positions = []
    for i in range(frame, len(dna), 3):
        codon = dna[i:i + 3].lower()
        if codon == start_codon:
            start_positions.append(i + 1)
        elif codon in stop_codons:
            stop_positions.append(i + 3)

    orffound = {}
    previous_stop = 0   # last base of the most recently reported ORF
    next_start = 0      # index of the first start codon not yet ruled out
    num_starts = len(start_positions)

    for position_stop in stop_positions:
        # Start codons at or before the previous ORF's stop belong to that
        # ORF (or precede it) and can never open a new one.
        while next_start < num_starts and start_positions[next_start] <= previous_stop:
            next_start += 1

        # The earliest remaining start opens an ORF only if it is upstream
        # of this stop codon; otherwise this stop has no ORF in front of it.
        if next_start < num_starts and start_positions[next_start] < position_stop:
            position_start = start_positions[next_start]
            nameorf = "orf" + str(len(orffound) + 1)
            sizeorf = position_stop - position_start + 1
            orffound[nameorf] = str(position_start), position_stop, sizeorf, frame
            previous_stop = position_stop

    return orffound

#READ FASTA FILE AND SAVE HEADERS AND SEQUENCES IN A DICTIONARY
# NOTE(review): `infile` is not opened anywhere in this part of the file; it
# is assumed to be an already-open text handle on the FASTA file -- confirm.
# Sequence lines are collected per record and joined once at the end instead
# of growing a string line by line.
seq_chunks = {}

for line in infile:
    line = line.rstrip()
    if not line:
        # Blank lines (common at the end of FASTA files) carry no data;
        # indexing line[0] on them would raise IndexError.
        continue
    if line[0] == '>':
        # Record name = first whitespace-delimited word, without the '>'.
        words = line.split()
        name = words[0][1:]
        seq_chunks[name] = []
    else:
        seq_chunks[name].append(line)

seqs = {record: ''.join(chunks) for record, chunks in seq_chunks.items()}

#DEFINE FRAME TO FIND ORF
#if frame = 0, start from the first position in the sequence
frame = 0

#EXECUTE THE ORFFINDER FUNCTION
for header, seq in seqs.items():
    orf = orfFINDER(seq, frame)

    for numorf, (startorf, stoporf, lengthorf, frameorf) in orf.items():
        # Only ORFs strictly longer than 30 bp are reported.
        if int(lengthorf) > 30:
            print(header,numorf,"start",startorf,"stop",stoporf,"length",lengthorf,"frame",frameorf)
infile.close()

Tags: in, for, if, line, position, frame, start, num