Cython function taking more time than pure Python

Posted 2024-05-15 18:38:46


I am trying to speed up my code, and this particular part of it is giving me problems.

I tried using Cython, following the advice given here, but my pure Python function performs better than both the Cython and the optimized Cython versions.

The Cython code is as follows:

import numpy as np
cimport numpy as np

DTYPE = np.float64
ctypedef np.float64_t DTYPE_t

cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
def compute_cython(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):

    DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6   
    IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6

    delta = u-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

    x= u/IceI;
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile


def compute_cythonOptimized(np.ndarray[DTYPE_t, ndim=1] u, np.ndarray[DTYPE_t, ndim=1] PorosityProfile, np.ndarray[DTYPE_t, ndim=1] DensityIceProfile, np.ndarray[DTYPE_t, ndim=1] DensityDustProfile, np.ndarray[DTYPE_t, ndim=1] DensityProfile):

    assert u.dtype == DTYPE
    assert PorosityProfile.dtype == DTYPE
    assert DensityIceProfile.dtype == DTYPE
    assert DensityDustProfile.dtype == DTYPE
    assert DensityProfile.dtype == DTYPE

    cdef float DustJ = 250.0
    cdef float DustF = 633.0  
    cdef float DustG = 2.513 
    cdef float DustH = -2.2e-3   
    cdef float DustI = -2.8e-6 
    cdef float IceI =  273.16
    cdef float IceC =  1.843e5 
    cdef float IceD =  1.6357e8 
    cdef float IceE =  3.5519e9 
    cdef float IceF =  1.6670e2 
    cdef float IceG =  6.4650e4
    cdef float IceH =  1.6935e6

    cdef np.ndarray[DTYPE_t, ndim=1] delta = u-DustJ
    cdef np.ndarray[DTYPE_t, ndim=1] result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

    cdef np.ndarray[DTYPE_t, ndim=1] x= u/IceI;
    cdef np.ndarray[DTYPE_t, ndim=1] result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile

I then run the following commands (the original code block was lost in extraction; they were IPython %timeit calls along these lines):

%timeit compute_python(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile)
%timeit compute_cython(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile)
%timeit compute_cythonOptimized(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile)

The results are as follows:

For pure Python: 68.9 µs ± 851 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

For unoptimized Cython: 68.2 µs ± 685 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

For the optimized version: 72.7 µs ± 416 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

What am I doing wrong?

Thanks for your help.


2 Answers

A solution using Numba

CodeSurgeon has already given an excellent answer using Cython. In this answer I would like to show an alternative approach using Numba.

I created three versions. In naive_Numba I only added a function decorator. In improved_Numba I manually fused the loops (every vectorized command is really a loop of its own). In improved_Numba_p I parallelized the function. Please note that there is apparently a bug that does not allow declaring constant values inside the function when using the parallel accelerator. Also note that the parallelized version is only beneficial for larger input arrays. You could, however, add a small wrapper that calls the single-threaded or the parallelized version depending on the size of the input array, as sketched below.
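Such a wrapper could look roughly like the following minimal sketch (the 100 000-element threshold is an assumption; pick it from your own timings, and add the constant arguments if you use the parallel float64 version below):

def compute(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
    # dispatch on size: thread startup overhead makes the parallel version
    # slower than the serial one on small arrays (see the timings below)
    if u.shape[0] < 100_000:
        return improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile)
    return improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile)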

Code (dtype=np.float64)

import numba as nb
import numpy as np
import time



@nb.njit(fastmath=True)
def naive_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6   
  IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6

  delta = u-DustJ
  result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

  x= u/IceI;
  result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

  return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile

#error_model='numpy' sets division by 0 to NaN instead of throwing an exception, which allows vectorization
@nb.njit(fastmath=True,error_model='numpy')
def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6   
  IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6
  res=np.empty(u.shape[0],dtype=u.dtype)

  for i in range(u.shape[0]):
    delta = u[i]-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

    x= u[i]/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

  return res

#there is obviously a bug in Numba (declaring const values in the function)
@nb.njit(fastmath=True,parallel=True,error_model='numpy')
def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH):
  res=np.empty((u.shape[0]),dtype=u.dtype)

  for i in nb.prange(u.shape[0]):
    delta = u[i]-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

    x= u[i]/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

  return res

u=np.array(np.random.rand(1000000),dtype=np.float64)
PorosityProfile=np.array(np.random.rand(1000000),dtype=np.float64)
DensityIceProfile=np.array(np.random.rand(1000000),dtype=np.float64)
DensityDustProfile=np.array(np.random.rand(1000000),dtype=np.float64)
DensityProfile=np.array(np.random.rand(1000000),dtype=np.float64)
DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6

#don't measure compilation overhead on first call
res=improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH)

t1=time.time()
for i in range(1000):
  res=improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH)
print(time.time()-t1)

Performance

(The float64 timings were lost when this page was extracted; the float32 timings further below show the relative speedups.)

Code (dtype=np.float32)

If you work with np.float32, it is enough to declare all of the constant values inside the function explicitly as float32; otherwise Numba will use float64.

@nb.njit(fastmath=True,error_model='numpy')
def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  DustJ, DustF, DustG, DustH, DustI = nb.float32(250.0), nb.float32(633.0), nb.float32(2.513), nb.float32(-2.2e-3), nb.float32(-2.8e-6)
  IceI, IceC, IceD, IceE, IceF, IceG, IceH =  nb.float32(273.16), nb.float32(1.843e5), nb.float32(1.6357e8), nb.float32(3.5519e9), nb.float32(1.6670e2),  nb.float32(6.4650e4), nb.float32(1.6935e6)
  res=np.empty(u.shape[0],dtype=u.dtype)

  for i in range(u.shape[0]):
    delta = u[i]-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3)

    x= u[i]/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(nb.float32(1)+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

  return res

@nb.njit(fastmath=True,parallel=True,error_model='numpy')
def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  res=np.empty((u.shape[0]),dtype=u.dtype)
  DustJ, DustF, DustG, DustH, DustI = nb.float32(250.0), nb.float32(633.0), nb.float32(2.513), nb.float32(-2.2e-3), nb.float32(-2.8e-6)
  IceI, IceC, IceD, IceE, IceF, IceG, IceH =  nb.float32(273.16), nb.float32(1.843e5), nb.float32(1.6357e8), nb.float32(3.5519e9), nb.float32(1.6670e2),  nb.float32(6.4650e4), nb.float32(1.6935e6)

  for i in nb.prange(u.shape[0]):
    delta = u[i]-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3)

    x= u[i]/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(nb.float32(1)+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

  return res

Performance

Arraysize np.random.rand(100).astype(np.float32)
Numpy             29.3µs
improved Numba:   1.33µs
improved_Numba_p: 18µs


Arraysize np.random.rand(1000000).astype(np.float32)
Numpy             117ms
improved Numba:   2.46ms
improved_Numba_p: 1.56ms

A comparison with the Cython version provided by @CodeSurgeon would not be entirely fair, because he did not compile his function with AVX2 and FMA3 instructions enabled. Numba compiles with -march=native by default, which enables AVX2 and FMA3 instructions on my Core i7-4xxx.
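(If you want to check which instructions Numba actually emitted, the compiled dispatcher exposes the generated assembly; a small sketch, assuming improved_Numba has already been called once so that a signature has been compiled:)

# one assembly listing per compiled signature; search it for vfmadd/ymm
# instructions to confirm that FMA3/AVX2 were actually used
asm = list(improved_Numba.inspect_asm().values())[0]
print(asm)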

That behavior makes sense, though, if you want to distribute a compiled Cython version of your code: with those optimizations enabled it would not run on pre-Haswell processors (or on any Pentium or Celeron), which lack those instruction sets. Compiling multiple code paths should be possible, but it is compiler dependent and means more work.

I mostly agree with the advice given by @chepner and @juanpa.arrivillaga in the comments. Additionally, numpy already performs all of these scalar operations efficiently in compiled code.

However, there actually is a way to significantly improve the performance of your cython code, thanks to the way your particular algorithm is structured, if we use the following assumptions (and can tolerate ugly code):

  • Your arrays are all one-dimensional, which makes iterating over every item in an array very simple. We don't need to replace harder numpy functions such as numpy.dot, since all of the operations in your code only combine scalars with arrays.
  • While using a for loop in python is unthinkable, iterating over every index in cython is very feasible. Additionally, each item in the final output depends only on the inputs at that item's index (i.e. the 0th item uses u[0], PorosityProfile[0], and so on).
  • You are not interested in any of the intermediate arrays, only in the final result returned by your compute_python function. So why waste time allocating memory for all of those intermediate numpy arrays?
  • Using the x**y syntax is surprisingly slow. I use the gcc compiler option -ffast-math to improve this significantly. I also use several cython compiler directives to avoid python checks and overhead.
  • Creating numpy arrays themselves can carry python overhead, so I use a combination of typed memoryviews (the preferred, newer syntax anyway) and malloc'd pointers to create the output array without much interaction with python (only two lines, getting the output size and the return statement, show significant python interaction, as the cython annotation file shows; see the short command after this list).
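A quick way to see that remaining python interaction is Cython's annotation output (standard Cython tooling; the module name matches the file below):

cython -a sublimation.pyx
# then open sublimation.html in a browser: yellow-highlighted lines
# are the ones that still call into the Python C-API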

Taking all of these factors into account, here is the modified code. It performs nearly an order of magnitude faster than the naive python version on my laptop.

sublimation.pyx

from libc.stdlib cimport malloc, free

def compute_cython(float[:] u, float[:] porosity_profile, 
        float[:] density_ice_profile, float[:] density_dust_profile, 
        float[:] density_profile):    
    cdef:
        float dust_j, dust_f, dust_g, dust_h, dust_i
        float ice_i, ice_c, ice_d, ice_e, ice_f, ice_g, ice_h
        int size, i
        float dt, result_dust, x, dust
        float result_ice_numer, result_ice_denom, result_ice, ice
        float* out

    dust_j, dust_f, dust_g, dust_h, dust_i = \
        250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
    ice_i, ice_c, ice_d, ice_e, ice_f, ice_g, ice_h = \
        273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
    size = len(u)
    out = <float *>malloc(size * sizeof(float))

    for i in range(size):
        dt = u[i] - dust_j
        result_dust = dust_f + (dust_g*dt) + (dust_h*dt**2) + (dust_i*dt**3)
        x = u[i] / ice_i
        result_ice_numer = x**3*(ice_c + ice_d*x**2 + ice_e*x**6)
        result_ice_denom = 1 + ice_f*x**2 + ice_g*x**4 + ice_h*x**8
        result_ice = result_ice_numer / result_ice_denom
        ice = density_ice_profile[i]*result_ice
        dust = density_dust_profile[i]*result_dust
        out[i] = (dust + ice)/density_profile[i]
    return <float[:size]>out
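(As an aside to the x**y point above: if you would rather not rely on -ffast-math, a hedged alternative is to expand the small constant powers by hand, since they may otherwise compile down to C pow calls depending on the Cython version. Inside the loop, with cdef float x2, x4, x8 added to the declarations:)

        x2 = x * x
        x4 = x2 * x2
        x8 = x4 * x4
        result_ice_numer = x * x2 * (ice_c + ice_d*x2 + ice_e*x2*x4)
        result_ice_denom = 1 + ice_f*x2 + ice_g*x4 + ice_h*x8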

setup.py
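The original setup.py block was lost in extraction; a minimal sketch consistent with the flags and directives mentioned in this answer (the exact option set is an assumption) would be:

from setuptools import setup, Extension
from Cython.Build import cythonize

ext = Extension(
    'sublimation',
    sources=['sublimation.pyx'],
    # -ffast-math as mentioned above; -fopenmp is only needed for the
    # prange version further below and must appear in both flag lists
    extra_compile_args=['-O3', '-ffast-math', '-fopenmp'],
    extra_link_args=['-fopenmp'],
)

setup(ext_modules=cythonize(
    ext,
    # the "cython compiler directives" referred to above (assumed set)
    compiler_directives={'boundscheck': False, 'wraparound': False, 'cdivision': True},
))

Build it with python3 setup.py build_ext --inplace.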

main.py

import numpy as np
import sublimation as sub

def compute_python(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
    DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6   
    IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6
    delta = u-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3)
    x = u/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))
    return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile

size = 100
u = np.random.rand(size).astype(np.float32)
porosity = np.random.rand(size).astype(np.float32)
ice = np.random.rand(size).astype(np.float32)
dust = np.random.rand(size).astype(np.float32)
density = np.random.rand(size).astype(np.float32)

"""
Run these from the terminal to test the performance!
python3 -m timeit -s "from main import compute_python, u, porosity, ice, dust, density" "compute_python(u, porosity, ice, dust, density)"
python3 -m timeit -s "from main import sub, u, porosity, ice, dust, density" "sub.compute_cython(u, porosity, ice, dust, density)"
python3 -m timeit -s "import numpy as np; from main import sub, u, porosity, ice, dust, density" "np.asarray(sub.compute_cython(u, porosity, ice, dust, density))"

The first command tests the python version. (10000 loops, best of 3: 45.5 usec per loop)
The second command tests the cython version, but returns just a memoryview object. (100000 loops, best of 3: 4.63 usec per loop)
The third command tests the cython version, but converts the result to a ndarray (slower). (100000 loops, best of 3: 6.3 usec per loop)
"""

Let me know if anything in my explanation of how this answer works is unclear, and I hope it helps!


Update 1:

Unfortunately, I could not get MSYS2 and numba (which depends on LLVM) to play nicely with each other, so I could not make any direct comparisons. However, following @max9111's advice, I added -march=native to the args list in my setup.py file; the timings, however, were not noticeably different from before.

According to this great answer, some overhead occurs in the automatic conversion between numpy arrays and typed memoryviews, both in the initial function call and in the return statement if you convert the result back afterwards. Reverting to a function signature like the following:

import numpy as np
cimport numpy as np

ctypedef np.float32_t DTYPE_t

def compute_cython_np(
        np.ndarray[DTYPE_t, ndim=1] u,
        np.ndarray[DTYPE_t, ndim=1] porosity_profile,
        np.ndarray[DTYPE_t, ndim=1] density_ice_profile,
        np.ndarray[DTYPE_t, ndim=1] density_dust_profile,
        np.ndarray[DTYPE_t, ndim=1] density_profile):
    # ... body identical to compute_cython above ...

saves about 1 µs per call, bringing the time down to 3.6 µs instead of 4.6 µs, which is somewhat significant, especially if the function is to be called many times. Of course, if you plan to call the function many times, it may be more efficient to pass in 2D numpy arrays instead, saving a lot of python function-call overhead and amortizing the cost of the numpy array -> typed memoryview conversion. Additionally, it could be interesting to use numpy structured arrays, which can be converted in cython into a typed memoryview of structs, as this would put all of the data for each item closer together in cache and speed up memory access times (a sketch follows below).
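As a hedged illustration of that last idea (the names and layout here are hypothetical), a structured array built on the python side:

import numpy as np
profile_dtype = np.dtype([
    ('u', np.float32),
    ('density_ice', np.float32),
    ('density_dust', np.float32),
    ('density', np.float32),
])
profiles = np.zeros(100, dtype=profile_dtype)

can be accepted in cython as a typed memoryview of a matching packed struct, so the four values for each item sit next to each other in memory:

cdef packed struct Profile:
    float u
    float density_ice
    float density_dust
    float density

def compute_struct(Profile[::1] profiles):
    cdef Py_ssize_t i
    for i in range(profiles.shape[0]):
        # profiles[i].u, profiles[i].density_ice, ... now share a cache line
        ...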

Finally, as promised in the comments above, here is a version that uses prange to take advantage of parallel processing. Note that this can only be used with typed memoryviews, as python's GIL must be released within a prange loop (and it must be compiled with the -fopenmp flag in both the compile args and the link args):

from cython.parallel import prange
from libc.stdlib cimport malloc, free
def compute_cython_p(float[:] u, float[:] porosity_profile, 
        float[:] density_ice_profile, float[:] density_dust_profile, 
        float[:] density_profile):    
    cdef:
        float dust_j, dust_f, dust_g, dust_h, dust_i
        float ice_i, ice_c, ice_d, ice_e, ice_f, ice_g, ice_h
        int size, i
        float dt, result_dust, x, dust
        float result_ice_numer, result_ice_denom, result_ice, ice
        float* out

    dust_j, dust_f, dust_g, dust_h, dust_i = \
        250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
    ice_i, ice_c, ice_d, ice_e, ice_f, ice_g, ice_h = \
        273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
    size = len(u)
    out = <float *>malloc(size * sizeof(float))

    for i in prange(size, nogil=True):
        dt = u[i] - dust_j
        result_dust = dust_f + (dust_g*dt) + (dust_h*dt**2) + (dust_i*dt**3)
        x = u[i] / ice_i
        result_ice_numer = x**3*(ice_c + ice_d*x**2 + ice_e*x**6)
        result_ice_denom = 1 + ice_f*x**2 + ice_g*x**4 + ice_h*x**8
        result_ice = result_ice_numer / result_ice_denom
        ice = density_ice_profile[i]*result_ice
        dust = density_dust_profile[i]*result_dust
        out[i] = (dust + ice)/density_profile[i]
    return <float[:size]>out

Update 2:

Following @max9111's very helpful additional suggestion in the comments, I switched all of the float[:] declarations in my code to float[::1]. The significance of this is that it guarantees the data is stored contiguously in memory, so cython does not need to worry about a stride between elements. This allows SIMD vectorization, which further optimizes the code significantly. Here are the updated timings, generated using the following commands:

python3 -m timeit -s "from main import compute_python, u, porosity, ice, dust, density" "compute_python(u, porosity, ice, dust, density)"
python3 -m timeit -s "import numpy as np; from main import sub, u, porosity, ice, dust, density" "np.asarray(sub.compute_cython(u, porosity, ice, dust, density))"
python3 -m timeit -s "import numpy as np; from main import sub, u, porosity, ice, dust, density" "np.asarray(sub.compute_cython_p(u, porosity, ice, dust, density))"

size = 100
python: 44.7 usec per loop
cython serial: 4.44 usec per loop
cython parallel: 111 usec per loop
cython serial contiguous: 3.83 usec per loop
cython parallel contiguous: 116 usec per loop

size = 1000
python: 167 usec per loop
cython serial: 16.4 usec per loop
cython parallel: 115 usec per loop
cython serial contiguous: 8.24 usec per loop
cython parallel contiguous: 111 usec per loop

size = 10000
python: 1.32 msec per loop
cython serial: 128 usec per loop
cython parallel: 142 usec per loop
cython serial contiguous: 55.5 usec per loop
cython parallel contiguous: 150 usec per loop

size = 100000
python: 19.5 msec per loop
cython serial: 1.21 msec per loop
cython parallel: 691 usec per loop
cython serial contiguous: 473 usec per loop
cython parallel contiguous: 274 usec per loop

size = 1000000
python: 211 msec per loop
cython serial: 12.3 msec per loop
cython parallel: 5.74 msec per loop
cython serial contiguous: 4.82 msec per loop
cython parallel contiguous: 1.99 msec per loop
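For reference, the only source change for the "contiguous" variants is in the signatures, e.g.:

def compute_cython(float[::1] u, float[::1] porosity_profile,
        float[::1] density_ice_profile, float[::1] density_dust_profile,
        float[::1] density_profile):
    ...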
