Cython function taking more time than pure Python

Posted 2024-05-15 18:38:46


I am trying to speed up my code, and this particular part of it is giving me problems.

I tried using Cython, following the advice given here, but my pure Python function performs better than both the Cython and the optimized Cython versions.

The Cython code is as follows:

import numpy as np
cimport numpy as np

DTYPE = np.float64
ctypedef np.float64_t DTYPE_t

cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
def compute_cython(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):

    DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6   
    IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6

    delta = u-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

    x= u/IceI;
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile


def compute_cythonOptimized(np.ndarray[DTYPE_t, ndim=1] u, np.ndarray[DTYPE_t, ndim=1] PorosityProfile, np.ndarray[DTYPE_t, ndim=1] DensityIceProfile, np.ndarray[DTYPE_t, ndim=1] DensityDustProfile, np.ndarray[DTYPE_t, ndim=1] DensityProfile):

    assert u.dtype == DTYPE
    assert PorosityProfile.dtype == DTYPE
    assert DensityIceProfile.dtype == DTYPE
    assert DensityDustProfile.dtype == DTYPE
    assert DensityProfile.dtype == DTYPE

    cdef float DustJ = 250.0
    cdef float DustF = 633.0  
    cdef float DustG = 2.513 
    cdef float DustH = -2.2e-3   
    cdef float DustI = -2.8e-6 
    cdef float IceI =  273.16
    cdef float IceC =  1.843e5 
    cdef float IceD =  1.6357e8 
    cdef float IceE =  3.5519e9 
    cdef float IceF =  1.6670e2 
    cdef float IceG =  6.4650e4
    cdef float IceH =  1.6935e6

    cdef np.ndarray[DTYPE_t, ndim=1] delta = u-DustJ
    cdef np.ndarray[DTYPE_t, ndim=1] result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

    cdef np.ndarray[DTYPE_t, ndim=1] x= u/IceI;
    cdef np.ndarray[DTYPE_t, ndim=1] result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile

I then run the following commands (the original code block was lost in extraction; they were IPython %timeit calls along these lines):

%timeit compute_python(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile)
%timeit compute_cython(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile)
%timeit compute_cythonOptimized(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile)

The results are as follows:

For pure Python: 68.9 µs ± 851 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

For unoptimized Cython: 68.2 µs ± 685 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

For the optimized version: 72.7 µs ± 416 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

What am I doing wrong?

Thanks for your help.


2 Answers

A solution using Numba

CodeSurgeon has already given an excellent answer using Cython. In this answer I would like to show an alternative approach using Numba.

I created three versions. In naive_Numba I only added a function decorator. In improved_Numba I manually fused the loops (every vectorized command is really a loop of its own). In improved_Numba_p I parallelized the function. Please note that there is apparently a bug that does not allow declaring constant values inside the function when using the parallel accelerator. Also note that the parallelized version is only beneficial for larger input arrays. You could, however, add a small wrapper that calls the single-threaded or the parallelized version depending on the size of the input array, as sketched below.
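Such a wrapper could look roughly like the following minimal sketch (the 100 000-element threshold is an assumption; pick it from your own timings, and add the constant arguments if you use the parallel float64 version below):

def compute(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
    # dispatch on size: thread startup overhead makes the parallel version
    # slower than the serial one on small arrays (see the timings below)
    if u.shape[0] < 100_000:
        return improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile)
    return improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile)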

Code (dtype=np.float64)

import numba as nb
import numpy as np
import time



@nb.njit(fastmath=True)
def naive_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6   
  IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6

  delta = u-DustJ
  result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

  x= u/IceI;
  result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

  return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile

#error_model='numpy' sets division by 0 to NaN instead of throwing an exception, which allows vectorization
@nb.njit(fastmath=True,error_model='numpy')
def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6   
  IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6
  res=np.empty(u.shape[0],dtype=u.dtype)

  for i in range(u.shape[0]):
    delta = u[i]-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

    x= u[i]/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

  return res

#there is obviously a bug in Numba (declaring const values in the function)
@nb.njit(fastmath=True,parallel=True,error_model='numpy')
def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH):
  res=np.empty((u.shape[0]),dtype=u.dtype)

  for i in nb.prange(u.shape[0]):
    delta = u[i]-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

    x= u[i]/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

  return res

u=np.array(np.random.rand(1000000),dtype=np.float64)
PorosityProfile=np.array(np.random.rand(1000000),dtype=np.float64)
DensityIceProfile=np.array(np.random.rand(1000000),dtype=np.float64)
DensityDustProfile=np.array(np.random.rand(1000000),dtype=np.float64)
DensityProfile=np.array(np.random.rand(1000000),dtype=np.float64)
DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6

#don't measure compilation overhead on first call
res=improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH)

t1=time.time()
for i in range(1000):
  res=improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH)
print(time.time()-t1)

Performance

(The float64 timings were lost when this page was extracted; the float32 timings further below show the relative speedups.)

Code (dtype=np.float32)

If you work with np.float32, it is enough to declare all of the constant values inside the function explicitly as float32; otherwise Numba will use float64.

@nb.njit(fastmath=True,error_model='numpy')
def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  DustJ, DustF, DustG, DustH, DustI = nb.float32(250.0), nb.float32(633.0), nb.float32(2.513), nb.float32(-2.2e-3), nb.float32(-2.8e-6)
  IceI, IceC, IceD, IceE, IceF, IceG, IceH =  nb.float32(273.16), nb.float32(1.843e5), nb.float32(1.6357e8), nb.float32(3.5519e9), nb.float32(1.6670e2),  nb.float32(6.4650e4), nb.float32(1.6935e6)
  res=np.empty(u.shape[0],dtype=u.dtype)

  for i in range(u.shape[0]):
    delta = u[i]-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3)

    x= u[i]/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(nb.float32(1)+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

  return res

@nb.njit(fastmath=True,parallel=True,error_model='numpy')
def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  res=np.empty((u.shape[0]),dtype=u.dtype)
  DustJ, DustF, DustG, DustH, DustI = nb.float32(250.0), nb.float32(633.0), nb.float32(2.513), nb.float32(-2.2e-3), nb.float32(-2.8e-6)
  IceI, IceC, IceD, IceE, IceF, IceG, IceH =  nb.float32(273.16), nb.float32(1.843e5), nb.float32(1.6357e8), nb.float32(3.5519e9), nb.float32(1.6670e2),  nb.float32(6.4650e4), nb.float32(1.6935e6)

  for i in nb.prange(u.shape[0]):
    delta = u[i]-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3)

    x= u[i]/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(nb.float32(1)+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

  return res

Performance

Arraysize np.random.rand(100).astype(np.float32)
Numpy             29.3µs
improved Numba:   1.33µs
improved_Numba_p: 18µs


Arraysize np.random.rand(1000000).astype(np.float32)
Numpy             117ms
improved Numba:   2.46ms
improved_Numba_p: 1.56ms

A comparison with the Cython version provided by @CodeSurgeon would not be entirely fair, because he did not compile his function with AVX2 and FMA3 instructions enabled. Numba compiles with -march=native by default, which enables AVX2 and FMA3 instructions on my Core i7-4xxx.
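(If you want to check which instructions Numba actually emitted, the compiled dispatcher exposes the generated assembly; a small sketch, assuming improved_Numba has already been called once so that a signature has been compiled:)

# one assembly listing per compiled signature; search it for vfmadd/ymm
# instructions to confirm that FMA3/AVX2 were actually used
asm = list(improved_Numba.inspect_asm().values())[0]
print(asm)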

That behavior makes sense, though, if you want to distribute a compiled Cython version of your code: with those optimizations enabled it would not run on pre-Haswell processors (or on any Pentium or Celeron), which lack those instruction sets. Compiling multiple code paths should be possible, but it is compiler dependent and means more work.

I mostly agree with the advice given by @chepner and @juanpa.arrivillaga in the comments. Additionally, numpy already performs all of these scalar operations efficiently in compiled code.

However, there actually is a way to significantly improve the performance of your cython code, thanks to the way your particular algorithm is structured, if we use the following assumptions (and can tolerate ugly code):

  • Your arrays are all one-dimensional, which makes iterating over every item in an array very simple. We don't need to replace harder numpy functions such as numpy.dot, since all of the operations in your code only combine scalars with arrays.
  • While using a for loop in python is unthinkable, iterating over every index in cython is very feasible. Additionally, each item in the final output depends only on the inputs at that item's index (i.e. the 0th item uses u[0], PorosityProfile[0], and so on).
  • You are not interested in any of the intermediate arrays, only in the final result returned by your compute_python function. So why waste time allocating memory for all of those intermediate numpy arrays?
  • Using the x**y syntax is surprisingly slow. I use the gcc compiler option -ffast-math to improve this significantly. I also use several cython compiler directives to avoid python checks and overhead.
  • Creating numpy arrays themselves can carry python overhead, so I use a combination of typed memoryviews (the preferred, newer syntax anyway) and malloc'd pointers to create the output array without much interaction with python (only two lines, getting the output size and the return statement, show significant python interaction, as the cython annotation file shows; see the short command after this list).
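A quick way to see that remaining python interaction is Cython's annotation output (standard Cython tooling; the module name matches the file below):

cython -a sublimation.pyx
# then open sublimation.html in a browser: yellow-highlighted lines
# are the ones that still call into the Python C-API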

Taking all of these factors into account, here is the modified code. It performs nearly an order of magnitude faster than the naive python version on my laptop.

sublimation.pyx

from libc.stdlib cimport malloc, free

def compute_cython(float[:] u, float[:] porosity_profile, 
        float[:] density_ice_profile, float[:] density_dust_profile, 
        float[:] density_profile):    
    cdef:
        float dust_j, dust_f, dust_g, dust_h, dust_i
        float ice_i, ice_c, ice_d, ice_e, ice_f, ice_g, ice_h
        int size, i
        float dt, result_dust, x, dust
        float result_ice_numer, result_ice_denom, result_ice, ice
        float* out

    dust_j, dust_f, dust_g, dust_h, dust_i = \
        250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
    ice_i, ice_c, ice_d, ice_e, ice_f, ice_g, ice_h = \
        273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
    size = len(u)
    out = <float *>malloc(size * sizeof(float))

    for i in range(size):
        dt = u[i] - dust_j
        result_dust = dust_f + (dust_g*dt) + (dust_h*dt**2) + (dust_i*dt**3)
        x = u[i] / ice_i
        result_ice_numer = x**3*(ice_c + ice_d*x**2 + ice_e*x**6)
        result_ice_denom = 1 + ice_f*x**2 + ice_g*x**4 + ice_h*x**8
        result_ice = result_ice_numer / result_ice_denom
        ice = density_ice_profile[i]*result_ice
        dust = density_dust_profile[i]*result_dust
        out[i] = (dust + ice)/density_profile[i]
    return <float[:size]>out
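(As an aside to the x**y point above: if you would rather not rely on -ffast-math, a hedged alternative is to expand the small constant powers by hand, since they may otherwise compile down to C pow calls depending on the Cython version. Inside the loop, with cdef float x2, x4, x8 added to the declarations:)

        x2 = x * x
        x4 = x2 * x2
        x8 = x4 * x4
        result_ice_numer = x * x2 * (ice_c + ice_d*x2 + ice_e*x2*x4)
        result_ice_denom = 1 + ice_f*x2 + ice_g*x4 + ice_h*x8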

setup.py
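The original setup.py block was lost in extraction; a minimal sketch consistent with the flags and directives mentioned in this answer (the exact option set is an assumption) would be:

from setuptools import setup, Extension
from Cython.Build import cythonize

ext = Extension(
    'sublimation',
    sources=['sublimation.pyx'],
    # -ffast-math as mentioned above; -fopenmp is only needed for the
    # prange version further below and must appear in both flag lists
    extra_compile_args=['-O3', '-ffast-math', '-fopenmp'],
    extra_link_args=['-fopenmp'],
)

setup(ext_modules=cythonize(
    ext,
    # the "cython compiler directives" referred to above (assumed set)
    compiler_directives={'boundscheck': False, 'wraparound': False, 'cdivision': True},
))

Build it with python3 setup.py build_ext --inplace.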

main.py

import numpy as np
import sublimation as sub

def compute_python(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
    DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6   
    IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6
    delta = u-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3)
    x = u/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))
    return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile

size = 100
u = np.random.rand(size).astype(np.float32)
porosity = np.random.rand(size).astype(np.float32)
ice = np.random.rand(size).astype(np.float32)
dust = np.random.rand(size).astype(np.float32)
density = np.random.rand(size).astype(np.float32)

"""
Run these from the terminal to test the performance!
python3 -m timeit -s "from main import compute_python, u, porosity, ice, dust, density" "compute_python(u, porosity, ice, dust, density)"
python3 -m timeit -s "from main import sub, u, porosity, ice, dust, density" "sub.compute_cython(u, porosity, ice, dust, density)"
python3 -m timeit -s "import numpy as np; from main import sub, u, porosity, ice, dust, density" "np.asarray(sub.compute_cython(u, porosity, ice, dust, density))"

The first command tests the python version. (10000 loops, best of 3: 45.5 usec per loop)
The second command tests the cython version, but returns just a memoryview object. (100000 loops, best of 3: 4.63 usec per loop)
The third command tests the cython version, but converts the result to a ndarray (slower). (100000 loops, best of 3: 6.3 usec per loop)
"""

Let me know if anything in my explanation of how this answer works is unclear, and I hope it helps!


Update 1:

Unfortunately, I could not get MSYS2 and numba (which depends on LLVM) to play nicely with each other, so I could not make any direct comparisons. However, following @max9111's advice, I added -march=native to the args list in my setup.py file; the timings, however, were not noticeably different from before.

According to this great answer, some overhead occurs in the automatic conversion between numpy arrays and typed memoryviews, both in the initial function call and in the return statement if you convert the result back afterwards. Reverting to a function signature like the following:

import numpy as np
cimport numpy as np

ctypedef np.float32_t DTYPE_t

def compute_cython_np(
        np.ndarray[DTYPE_t, ndim=1] u,
        np.ndarray[DTYPE_t, ndim=1] porosity_profile,
        np.ndarray[DTYPE_t, ndim=1] density_ice_profile,
        np.ndarray[DTYPE_t, ndim=1] density_dust_profile,
        np.ndarray[DTYPE_t, ndim=1] density_profile):
    # ... body identical to compute_cython above ...

saves about 1 µs per call, bringing the time down to 3.6 µs instead of 4.6 µs, which is somewhat significant, especially if the function is to be called many times. Of course, if you plan to call the function many times, it may be more efficient to pass in 2D numpy arrays instead, saving a lot of python function-call overhead and amortizing the cost of the numpy array -> typed memoryview conversion. Additionally, it could be interesting to use numpy structured arrays, which can be converted in cython into a typed memoryview of structs, as this would put all of the data for each item closer together in cache and speed up memory access times (a sketch follows below).
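As a hedged illustration of that last idea (the names and layout here are hypothetical), a structured array built on the python side:

import numpy as np
profile_dtype = np.dtype([
    ('u', np.float32),
    ('density_ice', np.float32),
    ('density_dust', np.float32),
    ('density', np.float32),
])
profiles = np.zeros(100, dtype=profile_dtype)

can be accepted in cython as a typed memoryview of a matching packed struct, so the four values for each item sit next to each other in memory:

cdef packed struct Profile:
    float u
    float density_ice
    float density_dust
    float density

def compute_struct(Profile[::1] profiles):
    cdef Py_ssize_t i
    for i in range(profiles.shape[0]):
        # profiles[i].u, profiles[i].density_ice, ... now share a cache line
        ...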

Finally, as promised in the comments above, here is a version that uses prange to take advantage of parallel processing. Note that this can only be used with typed memoryviews, as python's GIL must be released within a prange loop (and it must be compiled with the -fopenmp flag in both the compile args and the link args):

from cython.parallel import prange
from libc.stdlib cimport malloc, free
def compute_cython_p(float[:] u, float[:] porosity_profile, 
        float[:] density_ice_profile, float[:] density_dust_profile, 
        float[:] density_profile):    
    cdef:
        float dust_j, dust_f, dust_g, dust_h, dust_i
        float ice_i, ice_c, ice_d, ice_e, ice_f, ice_g, ice_h
        int size, i
        float dt, result_dust, x, dust
        float result_ice_numer, result_ice_denom, result_ice, ice
        float* out

    dust_j, dust_f, dust_g, dust_h, dust_i = \
        250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
    ice_i, ice_c, ice_d, ice_e, ice_f, ice_g, ice_h = \
        273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
    size = len(u)
    out = <float *>malloc(size * sizeof(float))

    for i in prange(size, nogil=True):
        dt = u[i] - dust_j
        result_dust = dust_f + (dust_g*dt) + (dust_h*dt**2) + (dust_i*dt**3)
        x = u[i] / ice_i
        result_ice_numer = x**3*(ice_c + ice_d*x**2 + ice_e*x**6)
        result_ice_denom = 1 + ice_f*x**2 + ice_g*x**4 + ice_h*x**8
        result_ice = result_ice_numer / result_ice_denom
        ice = density_ice_profile[i]*result_ice
        dust = density_dust_profile[i]*result_dust
        out[i] = (dust + ice)/density_profile[i]
    return <float[:size]>out

Update 2:

Following @max9111's very helpful additional suggestion in the comments, I switched all of the float[:] declarations in my code to float[::1]. The significance of this is that it guarantees the data is stored contiguously in memory, so cython does not need to worry about a stride between elements. This allows SIMD vectorization, which further optimizes the code significantly. Here are the updated timings, generated using the following commands:

python3 -m timeit -s "from main import compute_python, u, porosity, ice, dust, density" "compute_python(u, porosity, ice, dust, density)"
python3 -m timeit -s "import numpy as np; from main import sub, u, porosity, ice, dust, density" "np.asarray(sub.compute_cython(u, porosity, ice, dust, density))"
python3 -m timeit -s "import numpy as np; from main import sub, u, porosity, ice, dust, density" "np.asarray(sub.compute_cython_p(u, porosity, ice, dust, density))"

size = 100
python: 44.7 usec per loop
cython serial: 4.44 usec per loop
cython parallel: 111 usec per loop
cython serial contiguous: 3.83 usec per loop
cython parallel contiguous: 116 usec per loop

size = 1000
python: 167 usec per loop
cython serial: 16.4 usec per loop
cython parallel: 115 usec per loop
cython serial contiguous: 8.24 usec per loop
cython parallel contiguous: 111 usec per loop

size = 10000
python: 1.32 msec per loop
cython serial: 128 usec per loop
cython parallel: 142 usec per loop
cython serial contiguous: 55.5 usec per loop
cython parallel contiguous: 150 usec per loop

size = 100000
python: 19.5 msec per loop
cython serial: 1.21 msec per loop
cython parallel: 691 usec per loop
cython serial contiguous: 473 usec per loop
cython parallel contiguous: 274 usec per loop

size = 1000000
python: 211 msec per loop
cython serial: 12.3 msec per loop
cython parallel: 5.74 msec per loop
cython serial contiguous: 4.82 msec per loop
cython parallel contiguous: 1.99 msec per loop
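For reference, the only source change for the "contiguous" variants is in the signatures, e.g.:

def compute_cython(float[::1] u, float[::1] porosity_profile,
        float[::1] density_ice_profile, float[::1] density_dust_profile,
        float[::1] density_profile):
    ...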
