使用python折叠外显子

exon_to_gene_dict = {} with open('gene_to_exon', 'r') as file_gene, open('exon.fasta', 'r') as file_exons: for line in file_gene: line = line.rstrip().split() scaffold = line[0] gene = line[1] exon_to_gene_dict.setdefault(gene, []).append(scaffold)

exon_sequence_dict = {} with open(exons.fasta,'r') as file_exons: for line in file_exons: line = line.rstrip() if line.startswith('>'): exon_id = line[1:] else: sequence = line exon_sequence_dict[exon_id] = sequence gene_to_cds_dict = {} for exon, seq in exon_sequence_dict.items(): for gene_id, exon1 in gene_to_exon_dict.items(): if exon in exon1: gene_to_cds_dict.setdefault(gene_id, []).append(seq) gene_id_to_sequence =[] for gene_id, sequence in sorted(gene_to_cds_dict.items()): print(gene_id, file=f_out) print(''.join(sequence), file=f_out)

2条回答

网友

1楼 · 编辑于 2024-05-29 05:10:26

你已经完成了第一部分，现在你需要创建一个字典，把外显子和序列联系起来。如何进行取决于FASTA文件的结构。如果FASTA文件在多行上有序列（例如，最大行长度为40个字符），则需要找到一种方法将它们连接到序列字典中。否则，如果所有的核苷酸都在一条线上，你的时间会更轻松。您应该可以先按“>；”拆分，然后按换行符拆分。在

with open('exon.fasta','r') as infile:
    exon_text = infile.read()


exon_text = exon_text.split(">")[1:]
exon_text = [x.split("\n")[0:2] for x in exon_text]

exonDict = {}

for exon in text:
    exonDict[exon[0]] = exon[1]

现在您将拥有第二个字典，以exon名称为键，序列作为条目。有了这两个部分，您应该能够迭代和连接所有的序列。在

网友

2楼 · 编辑于 2024-05-29 05:10:26

这是一个完整的解决方案，使用BioPython。BioPython会自动处理the issue that Cory Weller raised（如果FASTA有多行分割的序列）。这个解决方案还将来自同一基因的序列串联成CDS。在

from Bio import SeqIO
from Bio import Seq
from collections import defaultdict

with open('gene_to_exon.txt') as f:
    lines = f.read().splitlines()
    exon_to_gene = dict(tuple(line.split()) for line in lines)

exon_records = SeqIO.to_dict(SeqIO.parse('exon.fasta', 'fasta'))

gene_records = defaultdict(list)
for exon_id, record in exon_records.items():
    gene_id = exon_to_gene[exon_id]
    gene_records[gene_id].append(record)

cds_records = []
for gene_id in gene_records:
    gene_record = gene_records[gene_id][0]
    if len(gene_records[gene_id]) > 1:
        sequence = ''.join([str(gene_record.seq) for gene_record in gene_records[gene_id]])
        gene_record.seq = Seq.Seq(sequence)
    gene_record.id = gene_id
    gene_record.description = gene_id # see https://www.biostars.org/p/156428/
    cds_records.append(gene_record)


with open('output.fasta', 'w') as g:
    SeqIO.write(cds_records, g, 'fasta')

在您的输入文件中，将得到以下结果：

^{pr2}$

编辑：这里也是一个纯Python实现。FASTA解析代码来自于this code monk blog post（并适用于python3），作者在这里详细描述了使用itertools.groupby而不是使用for循环的优势。在

import itertools
from collections import defaultdict

def isheader(line):
    return line.startswith('>')

def aspairs(filehandle):
  for header,group in itertools.groupby(filehandle, isheader):
    if header:
      line = next(group)
      ensembl_id = line[1:].split()[0]
    else:
      sequence = ''.join(line.strip() for line in group)
      yield ensembl_id, sequence

with open('exon.fasta') as f:
    exon_records = dict(aspairs(f))

with open('gene_to_exon.txt') as f:
    lines = f.read().splitlines()
    exon_to_gene = dict(tuple(line.split()) for line in lines)

gene_records = defaultdict(str)
for exon_id, seq in exon_records.items():
    gene_id = exon_to_gene[exon_id]
    gene_records[gene_id] += seq

with open('output.fasta', 'w') as f:
    for gene_id, seq in gene_records.items():
        f.write('>{}\n{}\n'.format(gene_id, seq))

相关问题更多 >

编程相关推荐

热门问题

热门文章