使用python删除循环中的冗余

contig protein start end con1 P1 140 602 con1 P2 140 602 con1 P3 232 548 con2 P4 335 801 con2 P5 642 732 con2 P6 335 779 con2 P7 729 812 con3 P8 17 348 con3 P9 16 348

from itertools import groupby def non_homolog(hits): nonhomolog=[] overst = False for i in range(1,len(hits)): (p, c) = hits[i-1], hits[i] if p[2] <= c[2] and c[3] <= p[3]: if not overst: nonhomolog.append(c) nonhomolog.append(c) overst = True return nonhomolog fh = open('example.txt') oh = open('nonhomologs.txt', 'w') for qid, grp in groupby(fh, lambda l: l.split()[0]): hits = [] for line in grp: hsp = line.split() hsp[2], hsp[3] = int(hsp[2]), int(hsp[3]) hits.append(hsp) hits.sort(key=lambda x: x[2]) if non_homolog(hits): for hit in hits: oh.write('\t'.join([str(f) for f in hit])+'\n')

1条回答

网友

1楼 · 发布于 2024-05-29 01:41:34

试试这个尺码：

# this code assumes Python 2.7
from itertools import groupby, izip
from operator import attrgetter

INPUT    = "file.txt"
HOMO_YES = "homologs.txt"
HOMO_NO  = "nonhomologs.txt"
MAX_DIFF = 5

class Row:
    __slots__ = ["line", "con", "protein", "start", "end"]

    def __init__(self, s):
        self.line    = s.rstrip()
        data         = s.split()
        self.con     = data[0]
        self.protein = data[1]
        self.start   = int(data[2])
        self.end     = int(data[3])

    def __str__(self):
        return self.line

def count_homologs(items, max_diff=MAX_DIFF):
    num_items  = len(items)
    counts     = [0] * num_items
    # first item
    for i, item_i in enumerate(items):
        max_start = item_i.start + max_diff
        max_end   = item_i.end   + max_diff
        # second item
        for j in xrange(i+1, num_items):
            item_j = items[j]
            if item_j.start > max_start:
                break
            elif item_j.end <= max_end:
                counts[i] += 1
                counts[j] += 1
    return counts

def main():
    with open(INPUT) as inf, open(HOMO_YES, "w") as outhomo, open(HOMO_NO, "w") as outnothomo:
        # skip header
        next(inf, '')
        rows = (Row(line) for line in inf)

        for con, item_iter in groupby(rows, key=attrgetter("con")):
            # per-con list of Rows sorted by start,end
            items = sorted(item_iter, key=attrgetter("start", "end"))
            # get #homologs for each item
            counts = count_homologs(items)
            # do output
            for c,item in izip(counts, items):
                if c:
                    outhomo.write(str(item) + "\n")
                else:
                    outnothomo.write(str(item) + "\n")

if __name__=="__main__":
    main()

根据给定的数据，生成：

===同系物.txt===

con1    P1  140 602
con1    P2  140 602
con3    P9  16  348
con3    P8  17  348

===非同系物.txt===

con1    P3  232 548
con2    P6  335 779
con2    P4  335 801
con2    P5  642 732
con2    P7  729 812

相关问题更多 >

编程相关推荐

热门问题

热门文章