如何使这种转换更有效？

import allel import pandas as pd import numpy as np import os callset = allel.read_vcf("/DATABASES/1KGphase3VCF/ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz") geno = (callset['calldata/GT']) index = list(callset['variants/POS']) samples = list(callset['samples']) df = pd.DataFrame(columns=samples) # df["Index"] = index # df = df.set_index('Index') geno_list = [] for i in range(0, len(geno[:, 0])): for k in range(0, len(geno[1, :])): if geno[i, k, 0] == 0 and geno[i, k, 1] == 0: geno_list.append(2) elif geno[i, k, 0] == 1 and geno[i, k, 1] == 0 or geno[i, k, 0] == 0 and geno[i, k, 1] == 1: geno_list.append(1) elif geno[i, k, 0] == 1 and geno[i, k, 1] == 1: geno_list.append(0) else: geno_list.append(9) df.loc[i] = geno_list geno_list = [] numpy_array = df.to_numpy() NEWLINE_SIZE_IN_BYTES = -1 # on linux -2 on Windows? with open('test_chr22.geno', 'wb') as fout: # Note 'wb' instead of 'w' np.savetxt(fout, numpy_array, delimiter="", fmt='%d') fout.seek(NEWLINE_SIZE_IN_BYTES, 2) fout.truncate()

1条回答

网友

1楼 · 发布于 2024-05-23 14:23:23

有几种方法可以加速代码并简化某些计算：

import allel
# just in time compilation for numpy calculations
# import numba as nb 
import numpy as np
# pandas import completely unnecessary -> numpy is sufficient


# @nb.njit
def create_mat(geno):
    # create matrix as np.uint8 (1 byte) instead of list of python integers (8 byte)
    # also no need to dynamically resize / increase list size
    geno_mat = np.zeros((len(geno[:, 0]), len(geno[1, :])), dtype=np.uint8)
    
    for i in np.arange(len(geno[:, 0])):
        for k in np.arange(len(geno[1, :])):
            g = geno[i, k]

            # nested ifs to avoid duplicate comparisons
            if g[0] == 0:
                if g[1] == 0:
                    geno_mat[i, k] = 2
                elif g[1] == 1:
                    geno_mat[i, k] = 1
                else:
                    geno_mat[i, k] = 9
            elif g[0] == 1:
                if g[1] == 0:
                    geno_mat[i, k] = 1
                elif g[1] == 1:
                    geno_mat[i, k] = 0
                else:
                    geno_mat[i, k] = 9
            else:
                geno_mat[i, k] = 9
    return geno_mat
    

# you really only need to read the "calldata/GT" fields - everything else was never used
# created a subset of the data using just the first 1000 columns
geno = allel.read_vcf("genomes_first_1000.vcf", fields=("calldata/GT",))["calldata/GT"]
mat = create_mat(geno)

测试时间：

In [7]: %timeit original(geno)
10.8 s ± 154 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [10]: %timeit optimized(geno)
5.8 s ± 49.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [19]: %timeit optimized_numba(geno)
6.65 ms ± 43.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

只要用numpy重写代码，我们就可以看到几乎2的加速。包括numba的jit编译，我们可以实现约1500的加速

相关问题更多 >

编程相关推荐

热门问题

热门文章