通过阅读PyCUDA文档、示例以及Kirk和Hwu关于CUDA的书,我成功地实现了一个基于CUDA C的复数矩阵乘法程序,并用PyCUDA编写了一个版本。C代码生成正确的结果,但是Python代码没有
更清楚地说,Python代码只是从示例(MatrixMulTiled)中获取的,并且已经修改为使用“cuComplex.h”中的cuComplexFloat来处理复数。在此修改之前,它是正确的实值矩阵相乘。在
所以我无法找出错误。Python代码是
# attempt to do matrix multiplication for complex numbers
import pycuda.autoinit
from pycuda import driver, compiler, gpuarray, tools
import numpy as np
from time import *
kernel_code_template = """
#include <cuComplex.h>
__global__ void MatrixMulKernel(cuFloatComplex *A, cuFloatComplex *B, cuFloatComplex *C)
{
const uint wA = %(MATRIX_SIZE)s;
const uint wB = %(MATRIX_SIZE)s;
// Block index
const uint bx = blockIdx.x;
const uint by = blockIdx.y;
// Thread index
const uint tx = threadIdx.x;
const uint ty = threadIdx.y;
// Index of the first sub-matrix of A processed by the block
const uint aBegin = wA * %(BLOCK_SIZE)s * by;
// Index of the last sub-matrix of A processed by the block
const uint aEnd = aBegin + wA - 1;
// Step size used to iterate through the sub-matrices of A
const uint aStep = %(BLOCK_SIZE)s;
// Index of the first sub-matrix of B processed by the block
const int bBegin = %(BLOCK_SIZE)s * bx;
// Step size used to iterate through the sub-matrcies of B
const uint bStep = %(BLOCK_SIZE)s * wB;
// The element of the block sub-matrix that is computed by the thread
cuFloatComplex Csub = make_cuFloatComplex(0,0);
// Loop over all the sub-matrices of A and B required to compute the block sub-matrix
for (int a = aBegin, b = bBegin;
a <= aEnd;
a += aStep, b += bStep)
{
// Shared memory for the sub-matrix of A
__shared__ cuFloatComplex As[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];
// Shared memory for the sub-matrix of B
__shared__ cuFloatComplex Bs[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];
// Load the matrices from global memory to shared memory;
// each thread loads one element of each matrix
As[ty][tx] = make_cuFloatComplex(cuCrealf(A[a + wA*ty + tx]),cuCimagf(A[a + wA*ty + tx]));
Bs[ty][tx] = make_cuFloatComplex(cuCrealf(B[b + wB*ty + tx]),cuCimagf(B[b + wA*ty + tx]));
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrcies together
// each thread computes one element of the block sub-matrix
for(int k = 0; k < %(BLOCK_SIZE)s; ++k)
{
Csub = cuCaddf(Csub,cuCmulf(As[ty][k],Bs[k][tx]));
}
// Synchronize to make sure that the preceding computation
// is done before loading two new sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to global memory
// each thread writes one element
const uint c = wB * %(BLOCK_SIZE)s * by + %(BLOCK_SIZE)s * bx;
C[c + wB*ty + tx] = make_cuFloatComplex(cuCrealf(Csub), cuCimagf(Csub));
}
"""
MATRIX_SIZE = 4
TILE_SIZE = 2
BLOCK_SIZE = TILE_SIZE
a_cpu = np.zeros(shape=(MATRIX_SIZE,MATRIX_SIZE)).astype(np.complex)
b_cpu = np.zeros(shape=(MATRIX_SIZE,MATRIX_SIZE)).astype(np.complex)
a_cpu[:,:] = 1 + 1j*0
b_cpu[:,:] = 1 + 1j*2
# compute reference on the CPU to verify GPU computation
t1 = time()
c_cpu = np.dot(a_cpu, b_cpu)
t2 = time()
t_cpu = t2-t1
# transfer host (CPU) memory to device (GPU) memory
a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)
# create empty gpuarry for the result (C = A * B)
c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.complex)
# get the kernel code from the template
# by specifying the constant MATRIX_SIZE
kernel_code = kernel_code_template % {
'MATRIX_SIZE': MATRIX_SIZE,
'BLOCK_SIZE': BLOCK_SIZE,
}
# compile the kernel code
mod = compiler.SourceModule(kernel_code)
# get the kernel function from the compiled module
matrixmul = mod.get_function("MatrixMulKernel")
# call the kernel on the card
t1 = time()
matrixmul(
# inputs
a_gpu, b_gpu,
# output
c_gpu,
# grid of multiple blocks
grid = (MATRIX_SIZE/TILE_SIZE, MATRIX_SIZE/TILE_SIZE),
# block of multiple threads
block = (TILE_SIZE, TILE_SIZE, 1),
)
t2 = time()
t_gpu = t2-t1
# print the results
print("-" * 80)
print("Matrix A (GPU): ")
print(a_gpu.get())
print("-" * 80)
print("Matrix B (GPU): ")
print(b_gpu.get())
print("-" * 80)
print("Matrix C (GPU): ")
print(c_gpu.get())
print("-" * 80)
print("Matrix C (CPU): ")
print(c_cpu)
print("-" * 80)
print("CPU-GPU Difference: ")
print(c_cpu-c_gpu.get())
print("CPU Time ", t_cpu)
print("GPU Time ", t_gpu)
np.allclose(c_cpu, c_gpu.get() )
C代码是
^{pr2}$Python代码的输出是
Matrix C (GPU):
[[ 1.59878214e-314 +1.59926782e-314j 1.59878214e-314 +1.59926782e-314j
1.59878214e-314 +1.59926782e-314j 1.59878214e-314 +1.59926782e-314j]
[ 1.59878214e-314 +1.59926782e-314j 1.59878214e-314 +1.59926782e-314j
1.59878214e-314 +1.59926782e-314j 1.59878214e-314 +1.59926782e-314j]
[ -9.01080877e+306 -5.19870527e+306j -1.45379609e+307 -8.65694841e+306j
-4.14125486e+306 -2.15325816e+306j -5.83708063e+306 -3.25935506e+306j]
[ -1.44828853e+306 -1.44828853e+306j -2.32949855e+306 -2.32949855e+306j
-3.78945180e+306 -3.78945180e+306j -6.54203686e+306 -6.54203686e+306j]]
--------------------------------------------------------------------------------
Matrix C (CPU):
[[ 4.+8.j 4.+8.j 4.+8.j 4.+8.j]
[ 4.+8.j 4.+8.j 4.+8.j 4.+8.j]
[ 4.+8.j 4.+8.j 4.+8.j 4.+8.j]
[ 4.+8.j 4.+8.j 4.+8.j 4.+8.j]]
C输出是
Matrix P is
4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000
4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000
4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000
4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000
如果有人能指出我的Python代码中的错误,我将不胜感激。我正努力在最后期限前完成我的论文,而我剩下的代码都是用Python编写的,所以我没有时间把它移植到C
谢谢!在
======================
这很可能是一个精度问题。我修复了它替换主机传输和空矩阵创建代码如下。。。在
# transfer host (CPU) memory to device (GPU) memory
a_gpu = gpuarray.to_gpu(a_cpu.astype(np.complex64))
b_gpu = gpuarray.to_gpu(b_cpu.astype(np.complex64))
# create empty gpuarry for the result (C = A * B)
c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.complex64)
希望这对你有帮助。在
这很可能是一个精度问题。我修复了它替换主机传输和空矩阵创建代码如下。。。在
但我确实发现了使用PyCUDA乘法大型复值矩阵的问题。例如,使用MATRIX_SIZE=5040和TILE_SIZE=16(从硬件的角度来看,这可能不是一个好的选择),CUDA-C确实设法使矩阵相乘,但是PyCUDA崩溃了。为什么要这样?在
相关问题 更多 >
编程相关推荐