CuBLASxt矩阵乘法在C++中成功,在Python中失败

2024-04-24 01:29:03 发布

您现在位置:Python中文网/ 问答频道 /正文

我试图在Ubuntu Linux 16.04上,用Python 2.7.14的ctypes包装CUDA 9.0中的cublasXt*gemm函数。这些函数接受主机内存中的数组作为参数。我已经成功地在C++中使用了它们:

#include <iostream>
#include <cstdlib>
#include "cublasXt.h"
#include "cuda_runtime_api.h"

// Allocate an m-by-n float array and fill it with pseudo-random values in
// [0, 1] drawn from rand().
//
// x    : output; rebound to a freshly allocated array of m*n floats. The
//        caller owns the storage and must release it with delete[].
// m, n : matrix dimensions; elements are written in increasing index order.
void rand_mat(float* &x, int m, int n) {
    x = new float[m*n];
    float* row = x;
    for (int r = 0; r < m; ++r, row += n) {
        for (int col = 0; col < n; ++col) {
            row[col] = static_cast<float>(rand()) / RAND_MAX;
        }
    }
}

// Multiply two random 4x4 host matrices with cublasXtSgemm on device 0.
//
// Returns 0 on success and 1 if handle creation, device selection or the
// multiplication fails. All host buffers and the cublasXt handle are released
// on every path once they have been acquired.
int main(void) {
    cublasXtHandle_t handle;
    if (cublasXtCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
        std::cout << "initialization failed" << std::endl;
        return 1;
    }

    int devices[1] = {0};
    if (cublasXtDeviceSelect(handle, 1, devices) !=
        CUBLAS_STATUS_SUCCESS) {
        std::cout << "initialization failed" << std::endl;
        cublasXtDestroy(handle);
        return 1;
    }

    float *a, *b, *c;
    int m = 4, n = 4, k = 4;

    // rand_mat allocates each operand with new[]; they are freed below.
    rand_mat(a, m, k);
    rand_mat(b, k, n);
    rand_mat(c, m, n);

    float alpha = 1.0;
    float beta = 0.0;

    // The operands are host pointers; the leading dimensions passed are
    // m for a, k for b and m for c.
    int status = 0;
    if (cublasXtSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                      m, n, k, &alpha, a, m, b, k, &beta, c, m) !=
           CUBLAS_STATUS_SUCCESS) {
        std::cout << "matrix multiply failed" << std::endl;
        status = 1;
    }

    // Storage obtained from new[] must be released with delete[]; plain
    // delete on these pointers is undefined behaviour.
    delete[] a; delete[] b; delete[] c;
    cublasXtDestroy(handle);
    return status;
}

但是,当我尝试按如下方式用Python包装它们时,cublasXt*gemm调用会触发段错误(segfault):

import ctypes
import numpy as np

# Load the cuBLAS shared library; the cublasXt* functions wrapped below are
# looked up on this object.
_libcublas = ctypes.cdll.LoadLibrary('libcublas.so')
# Declare return and argument types for each wrapped function. Handles and
# array arguments are passed as untyped pointers (c_void_p).
_libcublas.cublasXtCreate.restype = int
_libcublas.cublasXtCreate.argtypes = [ctypes.c_void_p]
_libcublas.cublasXtDestroy.restype = int
_libcublas.cublasXtDestroy.argtypes = [ctypes.c_void_p]
_libcublas.cublasXtDeviceSelect.restype = int
_libcublas.cublasXtDeviceSelect.argtypes = [ctypes.c_void_p,
                                            ctypes.c_int,
                                            ctypes.c_void_p]
_libcublas.cublasXtSgemm.restype = int
# NOTE(review): the size and leading-dimension arguments are declared as
# c_int here. The answer on this page reports that cublasXt.h declares
# m, n, k, lda, ldb and ldc as size_t, and that this mismatch is what makes
# the call below crash. This listing is kept as posted in the question.
_libcublas.cublasXtSgemm.argtypes = [ctypes.c_void_p,
                                     ctypes.c_int,
                                     ctypes.c_int,
                                     ctypes.c_int,
                                     ctypes.c_int,
                                     ctypes.c_int,
                                     ctypes.c_void_p,
                                     ctypes.c_void_p,
                                     ctypes.c_int,
                                     ctypes.c_void_p,
                                     ctypes.c_int,
                                     ctypes.c_void_p,
                                     ctypes.c_void_p,
                                     ctypes.c_int]

# Storage for the opaque library handle; cublasXtCreate receives a pointer
# to it via byref.
handle = ctypes.c_void_p()
_libcublas.cublasXtCreate(ctypes.byref(handle))
# One-element int32 array holding device id 0, passed by address.
deviceId = np.array([0], np.int32)
status = _libcublas.cublasXtDeviceSelect(handle, 1,
                                         deviceId.ctypes.data)
# Any non-zero status is treated as failure.
if status:
    raise RuntimeError

# 4x4 float32 host arrays: a and b are random inputs, c is the zeroed output.
a = np.random.rand(4, 4).astype(np.float32)
b = np.random.rand(4, 4).astype(np.float32)
c = np.zeros((4, 4), np.float32)

# This is the call the question reports as segfaulting. The scalar arguments
# are passed as pointers to temporary c_float objects.
status = _libcublas.cublasXtSgemm(handle, 0, 0, 4, 4, 4,
                                  ctypes.byref(ctypes.c_float(1.0)),
                                  a.ctypes.data, 4, b.ctypes.data, 4, 
                                  ctypes.byref(ctypes.c_float(0.0)),
                                  c.ctypes.data, 4)
if status:
    raise RuntimeError
# NOTE(review): c_gpu is not defined anywhere in this listing; the answer on
# this page replaces c_gpu.get() with c. The transposes presumably account
# for the library reading the row-major NumPy buffers as column-major --
# confirm against the cuBLAS documentation.
print 'success? ', np.allclose(np.dot(a.T, b.T).T, c_gpu.get())
_libcublas.cublasXtDestroy(handle)

奇怪的是,如果我稍微修改上面的Python包装器,使其接受已传输到GPU的pycuda.gpuarray.GPUArray矩阵,它们就能正常工作。为什么只有在Python中将主机内存传递给这些函数时才会遇到segfault呢?


Tags: if include np float ctypes int std handle
1条回答
网友
1楼 · 发布于 2024-04-24 01:29:03

这些Xt<t>gemm函数的cuBLAS文档似乎有误。至少从CUDA 8开始,参数m、n、k、lda、ldb、ldc都是size_t类型,这一点可以通过查看头文件cublasXt.h来确认。

对包装器做如下修改后,在我这里似乎可以正常工作:

$ cat t1340.py
import ctypes
import numpy as np

# Load the cuBLAS shared library; the cublasXt* functions wrapped below are
# looked up on this object.
_libcublas = ctypes.cdll.LoadLibrary('libcublas.so')
# Declare return and argument types for each wrapped function. Handles and
# array arguments are passed as untyped pointers (c_void_p).
_libcublas.cublasXtCreate.restype = int
_libcublas.cublasXtCreate.argtypes = [ctypes.c_void_p]
_libcublas.cublasXtDestroy.restype = int
_libcublas.cublasXtDestroy.argtypes = [ctypes.c_void_p]
_libcublas.cublasXtDeviceSelect.restype = int
_libcublas.cublasXtDeviceSelect.argtypes = [ctypes.c_void_p,
                                            ctypes.c_int,
                                            ctypes.c_void_p]
_libcublas.cublasXtSgemm.restype = int
# The six c_size_t entries are the m, n, k, lda, ldb and ldc arguments, which
# the accompanying answer says are size_t in cublasXt.h. The two c_int
# entries after the handle are left as c_int.
_libcublas.cublasXtSgemm.argtypes = [ctypes.c_void_p,
                                     ctypes.c_int,
                                     ctypes.c_int,
                                     ctypes.c_size_t,
                                     ctypes.c_size_t,
                                     ctypes.c_size_t,
                                     ctypes.c_void_p,
                                     ctypes.c_void_p,
                                     ctypes.c_size_t,
                                     ctypes.c_void_p,
                                     ctypes.c_size_t,
                                     ctypes.c_void_p,
                                     ctypes.c_void_p,
                                     ctypes.c_size_t]

# Storage for the opaque library handle; cublasXtCreate receives a pointer
# to it via byref.
handle = ctypes.c_void_p()
_libcublas.cublasXtCreate(ctypes.byref(handle))
# One-element int32 array holding device id 0, passed by address.
deviceId = np.array([0], np.int32)
status = _libcublas.cublasXtDeviceSelect(handle, 1,
                                         deviceId.ctypes.data)
# Any non-zero status is treated as failure.
if status:
    raise RuntimeError

# 4x4 float32 host arrays: a and b are random inputs, c is the zeroed output.
a = np.random.rand(4, 4).astype(np.float32)
b = np.random.rand(4, 4).astype(np.float32)
c = np.zeros((4, 4), np.float32)
# The scalars are held in named variables so the objects passed by reference
# stay alive for the whole call (the answer notes this may not matter).
alpha = ctypes.c_float(1.0)
beta = ctypes.c_float(0.0)

status = _libcublas.cublasXtSgemm(handle, 0, 0, 4, 4, 4,
                                 ctypes.byref(alpha),
                                 a.ctypes.data, 4, b.ctypes.data, 4,
                                 ctypes.byref(beta),
                                 c.ctypes.data, 4)
if status:
    raise RuntimeError
# Compare the host result c with a NumPy reference. The transposes presumably
# account for the library reading the row-major NumPy buffers as
# column-major -- confirm against the cuBLAS documentation.
print 'success? ', np.allclose(np.dot(a.T, b.T).T, c)
_libcublas.cublasXtDestroy(handle)
$ python t1340.py
success?  True
$

列举我所做的更改:

  1. 将cublasXtSgemm的m、n、k、lda、ldb、ldc参数的类型从c_int更改为c_size_t
  2. 为alpha和beta参数提供了显式变量;这可能是不相关的
  3. 在np.allclose调用中,将c_gpu.get()更改为c

以上在cuda8和cuda9上进行了测试。我已经向NVIDIA提交了一个内部bug来更新文档(即使是当前的cuda9文档也不能反映头文件的当前状态)

相关问题 更多 >