numpy数组的顺序如何影响乘法速度？

Question

numpy数组的排列顺序是如何影响乘法速度的？我该如何根据矩阵的大小自动选择排列顺序呢？

这个问题最初是源于使用cudamat的代码：

def test_mat():
    """Compare NumPy and cudamat timings for a (1 x m) by (m x n) product.

    For every exponent ``i`` in 1..19 the inner dimension is ``m = 2**i``
    while ``n`` stays at 1024.  Each round prints ``i``, then the elapsed
    CPU time, then the elapsed GPU time; ``'done'`` is printed at the end.

    Both operands are built in Fortran order and passed to ``dot`` as
    transposed views.  A round that raises is reported and skipped so the
    remaining sizes still run.
    """
    # NOTE(review): the original author was unsure whether cm.cublas_init()
    # and cm.cublas_shutdown() have to bracket the cudamat calls -- confirm.
    n = 1024

    for i in range(1, 20):  # author's note: beyond roughly 2**15 this fails
        m = 2 ** i
        print(i)
        try:
            # NOTE: the CPU timer starts before the arrays are generated, so
            # the first figure includes random-data creation and conversion.
            t0 = time.time()
            # Fortran order was chosen on the (unconfirmed) assumption that
            # CUDA prefers it for speed; order='C' is the alternative to try.
            cpum1 = np.array(np.random.rand(n, m)*10, dtype=np.float32, order='F')
            cpum2 = np.array(np.random.rand(m, 1)*10, dtype=np.float32, order='F')
            np.dot(cpum2.T, cpum1.T)
            print(time.time() - t0)

            # The GPU figure covers the host-to-device copies, the product
            # itself and the copy of the result back to the host.
            t0 = time.time()
            gpum1 = cm.CUDAMatrix(cpum1)
            gpum2 = cm.CUDAMatrix(cpum2)
            gm = cm.dot(gpum2.T, gpum1.T)
            gm.copy_to_host()
            print(time.time() - t0)
        except Exception as err:
            # Keep the best-effort behaviour (move on to the next size), but
            # say what went wrong and let KeyboardInterrupt/SystemExit through.
            print('2**%d failed: %r' % (i, err))

    print('done')

下面是我做的一些测试，但我还需要一些理论上的解释。

def test_order(m,n):            
    #default
    a = np.array(np.random.rand(m, n)*10, dtype=np.float32)
    b = np.array(np.random.rand(n, m)*10, dtype=np.float32)

    t0= time.time()
    c = np.dot(a,b)
    print (time.time()-t0)

    #1
    a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='C')
    b = np.array(np.random.rand(n, m)*10, dtype=np.float32, order='C')

    t0= time.time()
    c = np.dot(a,b)
    print (time.time()-t0)

    #2
    a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='C')
    b = np.array(np.random.rand(n, m)*10, dtype=np.float32, order='F')

    t0= time.time()
    c = np.dot(a,b)
    print (time.time()-t0)

    #3
    a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
    b = np.array(np.random.rand(n, m)*10, dtype=np.float32, order='C')

    t0= time.time()
    c = np.dot(a,b)
    print (time.time()-t0)

    #4
    a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
    b = np.array(np.random.rand(n, m)*10, dtype=np.float32, order='F')

    t0= time.time()
    c = np.dot(a,b)
    print (time.time()-t0)


    print 'done'    

m= 1024*10
n= 1024*1
7.125
7.14100003242
6.95299983025
8.14100003242
7.15600013733

m= 1024*1
n= 1024*10  
0.718999862671
0.734000205994
0.641000032425
0.656000137329
0.655999898911

这里是测试峰值内存使用情况的代码：

import numpy as np
import time
from memory_profiler import profile

@profile
def test_order_():
    """Time ``np.dot`` for the four C/F layout pairs under ``@profile``.

    ``a`` is ``(m, n)`` and ``b`` is ``(n, m)`` with ``m = 1024`` and
    ``n = 10240``; the product is copied into a preallocated ``(m, m)``
    float32 buffer ``c``.  One elapsed time (seconds) is printed per case,
    followed by ``'done'``.

    The four cases are deliberately left unrolled instead of being folded
    into a loop, so that a per-line profiler report can tell them apart
    (presumably why the author wrote them this way -- confirm).

    The author also noted ``np.dot(a, b, out=c)`` as an alternative to
    ``c[:] = np.dot(a, b)`` that only works when ``c`` is a C-ordered array.
    """
    m = 1024*1
    n = 1024*10

    # Result buffer reused by every case.  C order is what the author assumed
    # a plain ``c = np.dot(a, b)`` produces; switch to order='F' to compare.
    c = np.array(np.zeros((m, m)), dtype=np.float32, order='C')

    # Case 1: a in C order, b in C order.
    a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='C')
    b = np.array(np.random.rand(n, m)*10, dtype=np.float32, order='C')

    t0 = time.time()
    c[:] = np.dot(a, b)
    print(time.time() - t0)

    # Drop the operands before the next pair is allocated.
    del a
    del b

    # Case 2: a in C order, b in Fortran order.
    a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='C')
    b = np.array(np.random.rand(n, m)*10, dtype=np.float32, order='F')

    t0 = time.time()
    c[:] = np.dot(a, b)
    print(time.time() - t0)

    del a
    del b

    # Case 3: a in Fortran order, b in C order.
    a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
    b = np.array(np.random.rand(n, m)*10, dtype=np.float32, order='C')

    t0 = time.time()
    c[:] = np.dot(a, b)
    print(time.time() - t0)

    del a
    del b

    # Case 4: a in Fortran order, b in Fortran order.
    a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
    b = np.array(np.random.rand(n, m)*10, dtype=np.float32, order='F')

    t0 = time.time()
    c[:] = np.dot(a, b)
    print(time.time() - t0)

    del a
    del b

    print('done')

# Run the profiled layout benchmark only when this file is executed as a
# script, not when it is imported.
if __name__ == '__main__':
    test_order_()

我还找到了一些关于numpy.dot复制和fast_dot的信息。

dot的内部工作原理有点复杂，因为它试图使用BLAS优化的例程，这有时需要数组以Fortran顺序排列。

还有一些性能提示，但奇怪的是，我每次运行这些示例时都无法重现其中的结果。（也许某些数据在重新运行之前被缓存了？）

性能优化 numpy 矩阵乘法内存使用数组排列 blas fast_dot fortran顺序

numpy数组的顺序如何影响乘法速度？

2 个回答

撰写回答