使用Python计算大协方差矩阵
我正在使用numpy.memmap来计算一个很大的协方差矩阵。
我的问题是,为什么在第二种情况下(当我把A的转置存储在磁盘上)乘法会更慢呢?
def matrix_append_row(fm,tp,mat):
#check if number of columns is the same
rows= fm.shape[0] + mat.shape[0]
new_fm = np.memmap(fm.filename, mode='r+', dtype= tp, shape=(rows,fm.shape[1]))
new_fm[fm.shape[0]:rows,:]= mat[:]
return new_fm
def generate_and_store_data(cols,batch,iter,tp):
#create memmap file and append generated data in cycle
#can't create empty array - need to store first entry by copy not by append
fm= np.memmap('A.npy', dtype=tp, mode='w+', shape=(batch,cols))
for i in xrange(iter):
data= np.random.rand(batch,cols)*100 # [0-1]*100
# print i
# t0= time.time()
if i==0:
fm[:]= data[:]
else:
fm= matrix_append_row(fm,tp,data)
# print (time.time()-t0)
# fm.flush()
# print fm.shape
return fm
A= generate_and_store_data(1000,5000,10,'float32')
#cov= A.T*A
# A memmaped file
print A.shape
M= A.shape[0]
N= A.shape[1]
#create cov mat
cov= np.memmap('cov.npy', dtype='float32', mode='w+', shape=(N,N))
#A.T don't copy data just view?
t0= time.time()
np.dot(A.T,A,out= cov)
print (time.time()-t0)
print A.T.shape
cov_= np.memmap('cov_.npy', dtype='float32', mode='w+', shape=(N,N))
At= np.memmap('At.npy', dtype='float32', mode='w+', shape=(N,M))
t0= time.time()
At= A.T # only reference
print (time.time()-t0)
t0= time.time()
At[:]= A.T # copy same as At[:]= (A.T)[:] ?
print (time.time()-t0)
t0= time.time()
At[:]= (A.T)[:]
print (time.time()-t0)
t0= time.time()
np.dot(At,A,out= cov_)
print (time.time()-t0)
输出是
(50000, 1000)
2.84399986267
(1000, 50000)
0.0
2.17100000381
2.07899999619
2.73399996758
更新:
我也尝试了分块矩阵乘法,但速度非常慢(可能我需要调整块大小)
cov2= np.memmap('cov2.npy', dtype='float32', mode='w+', shape=(N,N))
block_size=100
t0= time.time()
for i_start in range(0, At.shape[0], block_size):
for j_start in range(0, A.shape[1], block_size):
for k_start in range(0, At.shape[1], block_size):
cov2[i_start:i_start+block_size, j_start:j_start + block_size] +=np.dot(At[i_start:i_start + block_size, k_start:k_start + block_size],A[k_start:k_start + block_size, j_start:j_start + block_size])
print (time.time()-t0)
1 个回答
2
所以对于一个较小的数组,即使内存布局的变化也影响了时间,我得到的结果是这样的:
arr = numpy.random.rand(500, 100)
arr *= 100
%timeit numpy.dot(arr.T, arr)
100 loops, best of 3: 7.52 ms per loop
%timeit numpy.dot(numpy.asfortranarray(arr.T), arr)
100 loops, best of 3: 7.53 ms per loop
%timeit numpy.dot(arr.T, numpy.asfortranarray(arr))
100 loops, best of 3: 5.26 ms per loop
我不太明白你发的那个大小为什么运行时间差别这么大,但这是我得到的结果:
arr = numpy.random.rand(50000, 1000)
arr *= 100
cov = numpy.zeros((1000, 1000))
%timeit numpy.dot(arr.T, arr, out=cov)
1 loops, best of 3: 12min 45s per loop
%timeit numpy.dot(numpy.asfortranarray(arr.T), arr, out=cov)
1 loops, best of 3: 12min 42s per loop
%timeit numpy.dot(arr.T, numpy.asfortranarray(arr), out=cov)
1 loops, best of 3: 7min 19s per loop
当然,这也很大程度上取决于你用什么BLAS实现来编译numpy,不过改变内存布局似乎是很值得的。