如果两个文件的第1列匹配，则提取第一个文件中第11列的值位于第二个文件第2列和第3列之间的行

chr5 20311169 20311244 5 20311177 20311251 K00230:40:HNWJLBBXX:4:1101:1002:35936 255 + - 20311210.00 chr5 26610220 26610295 5 26610221 26610296 K00230:40:HNWJLBBXX:4:1101:1022:24155 255 + - 26610258.00

chr5 20311200 20311220 Nucleosome:1 110 5.0 39.9 MainPeak 1.43492858 0.68583064 chr5 801 861 Nucleosome:2 70 1.0 5.4 MainPeak 0.17076187 0.806538035 chr5 1021 1091 Nucleosome:3 80 2.0 14.4 MainPeak 0.42430331 0.481579895 chr5 1181 1251 Nucleosome:4 80 1.0 7.5 MainPeak 0.1362587 0.32626102999999995 chr5 1361 1441 Nucleosome:5 90 2.0 14.7 MainPeak 0.34212933 0.291726595 chr5 1621 1801 Nucleosome:6 190 2.0 26.1 MainPeak:doublet 0.37546564 0.353192625 chr5 2011 2071 Nucleosome:7 70 1.0 5.7 MainPeak 0.15091517 0.396369735 chr5 2161 2331 Nucleosome:8 180 1.0 17.2 MainPeak 0.08865312 0.42133046500000004 chr5 2441 2561 Nucleosome:9 130 2.5 25.3 MainPeak 0.7368501 0.48843276 chr5 2781 2851 Nucleosome:10 80 3.0 17.5 MainPeak 0.80818501 1.303005 chr5 3271 3431 Nucleosome:11 170 3.0 34.5 MainPeak+Shoulder 0.72967697 1.348257495 chr5 3521 3571 Nucleosome:12 60 1.0 5.8 MainPeak 0.1880739 0.504429705 chr5 3641 3791 Nucleosome:13 160 1.0 12.5 MainPeak:doublet 0.10098579 0.363148215

1条回答

网友

1楼 · 发布于 2024-04-19 23:53:37

用一个从文本行中提取第 n 列的简单函数，就可以相当直接地解决这个问题。我假设你所说的“第11列”是指从 1 开始计数的第 11 列，而不是索引为 11 的列（索引从 0 开始，第一项的索引为 0）。

伪代码：

Until there's no data left ~
    Read line1 from file1
    Read line2 from file2
    Extract Col11 from line1 as a real number
    Extract Col2 & Col3 from line2 as real numbers
    IF Col11 is within Col2 & Col3
        do something

Python代码：

import sys

# Given a space-separated row of data, return the Nth column as a real number
def getNthColumn(row, N):
    """Return the N-th whitespace-separated column of ``row`` as a float.

    Columns are counted from 1, so ``N=1`` is the first column. Any run of
    spaces, tabs or newlines counts as a single separator, and leading or
    trailing whitespace is ignored.

    Args:
        row: One line of text holding whitespace-separated fields.
        N: 1-based column number to extract.

    Returns:
        The requested column converted with ``float()``.

    Raises:
        IndexError: If ``N`` is less than 1 or ``row`` has fewer than ``N``
            columns (this includes an empty or whitespace-only row).
        ValueError: If the requested column is not a valid number.
    """
    # Reject N <= 0 explicitly: fields[N - 1] would otherwise wrap around
    # via negative indexing and silently return a column from the end.
    if N < 1:
        raise IndexError("column number must be >= 1, got %r" % (N,))
    # str.split() with no argument already collapses runs of whitespace.
    fields = row.split()
    return float(fields[N - 1])   # list is 0-based, columns are 1-based

# Script entry: compare file1 and file2 row by row. Row i of file1 is a
# MATCH when its column 11 lies within [column 2, column 3] of row i of file2.
if len(sys.argv) == 3:
    # The with-statement closes both files even if parsing a row raises.
    # TODO - handle file-not-found errors, etc.
    with open(sys.argv[1], "rt") as fin1, open(sys.argv[2], "rt") as fin2:
        # zip() walks the files in lockstep and stops at the end of the
        # shorter one.
        for line1, line2 in zip(fin1, fin2):
            # Get the columns from the two lines
            f1_col11 = getNthColumn(line1, 11)
            f2_col2 = getNthColumn(line2, 2)
            f2_col3 = getNthColumn(line2, 3)  # TODO handle errors
            # Work out if it's a keeper (both bounds inclusive).
            # line1 still carries its trailing newline when printed.
            if f2_col2 <= f1_col11 <= f2_col3:
                print("MATCH: "+line1)
            else:
                print("NO-MATCH: "+line1)
else:
    print("Give 2 files as arguments")

坦诚地说，如果速度真的很关键，最好用编译型语言来编写，例如 C/C++、Pascal 等。

编辑：代码已经过测试，可以正常工作；另外添加了一些调试用的 print()。

编辑2：将文件1的每一行与文件2中的所有行进行比对

import sys

# Holds one (col2, col3) tuple of floats per row of file2. The script further
# down appends to this list while reading file2 and then sorts it in place.
file2_col23 = []

# Given a space-separated row of data, return the Nth column as a real number
def getNthColumn(row, N):
    """Return the N-th whitespace-separated column of ``row`` as a float.

    Columns are counted from 1, so ``N=1`` is the first column. Any run of
    spaces, tabs or newlines counts as a single separator.

    Args:
        row: One line of text holding whitespace-separated fields.
        N: 1-based column number to extract.

    Returns:
        The requested column converted with ``float()``.

    Raises:
        SystemExit: With exit status 1, after writing a diagnostic to
            stderr, when the column is missing, is not a number, or ``N``
            is less than 1.
    """
    # str.split() with no argument already collapses runs of whitespace.
    fields = row.split()
    try:
        # Reject N <= 0 explicitly: fields[N - 1] would otherwise wrap
        # around via negative indexing and silently return a wrong column.
        if N < 1:
            raise IndexError(N)
        result = float(fields[N - 1])   # list is 0-based, columns 1-based
    except (IndexError, ValueError):
        # Only the two expected parse failures are handled here, so that
        # KeyboardInterrupt and other unrelated errors are not swallowed.
        # The row is reported single-spaced.
        row = ' '.join(fields)
        sys.stderr.write("Failed to fetch number column %d from [%s]" % (N, row))
        sys.exit(1)
    return result

# Script entry: load every (col2, col3) range from file2, then report each
# row of file1 as MATCH when its column 11 falls inside at least one of
# those ranges (bounds inclusive), otherwise as NO-MATCH.
if len(sys.argv) == 3:
    # The with-statement closes both files even when getNthColumn() exits.
    # TODO - handle file-not-found errors, etc.
    with open(sys.argv[1], "rt") as fin1, open(sys.argv[2], "rt") as fin2:
        # Load in the whole of file2, but just the column2 & column3.
        # Track the smallest col2 and largest col3 so we can quickly know
        # whether a search can possibly produce a result.
        min_c2 = None
        max_c3 = None
        for line2 in fin2:
            # Skip blank lines instead of treating the first one as
            # end-of-file, which would silently drop the rest of file2.
            if not line2.strip():
                continue
            col2 = getNthColumn(line2, 2)
            col3 = getNthColumn(line2, 3)
            file2_col23.append((col2, col3))
            if min_c2 is None or col2 < min_c2:
                min_c2 = col2
            if max_c3 is None or col3 > max_c3:
                max_c3 = col3

        # Sort by col2 (then col3) so the scan below can stop early.
        file2_col23.sort()

        for line1 in fin1:
            col11 = getNthColumn(line1, 11)

            matched = False
            # min_c2 stays None when file2 had no data rows; nothing can
            # match then, and comparing a float with None would raise.
            if min_c2 is not None and min_c2 <= col11 <= max_c3:
                for col2, col3 in file2_col23:
                    if col2 > col11:
                        # Ranges are sorted by start, so every remaining
                        # range also starts after col11.
                        break
                    if col11 <= col3:
                        matched = True
                        break

            # line1 still carries its trailing newline when printed.
            if matched:
                print("MATCH: "+str(line1))
            else:
                print("NO-MATCH: "+str(line1))
else:
    print("Give 2 files as arguments")

相关问题更多 >

编程相关推荐

热门问题

热门文章