当大数组用作输入数据时的CUDA错误

import os
import sys
import time
import math

import pandas as pd
import numpy as np
from numba import cuda, float32

os.environ['NUMBAPRO_NVVM']=r'D:\NVIDIA GPU Computing Toolkit\CUDA\v8.0\nvvm\bin\nvvm64_31_0.dll'
os.environ['NUMBAPRO_LIBDEVICE']=r'D:\NVIDIA GPU Computing Toolkit\CUDA\v8.0\nvvm\libdevice'

# Launch configuration.  Only `bpg` blocks share all c*c column pairs, so a
# single launch of the kernel below is long-running.
bpg = 8
# Threads per block: x indexes the lag h, y splits the rows of one lag.
tpb = (4, 32)
# Shared-memory shapes; the second dimension is indexed by threadIdx.x, so it
# is derived from tpb[0] to keep the three constants in agreement.
tsize = (3, tpb[0])
hsize = (1, tpb[0])


@cuda.jit
def calcu_T(D, T):
    """Fill T[c_x, c_y] with the largest per-lag statistic of columns c_x, c_y.

    Each block starts at pair index ``blockIdx.x`` and advances by
    ``gridDim.x`` until all ``c_num * c_num`` (c_x, c_y) pairs are done.
    Diagonal entries are set to 0.  For an off-diagonal pair, every
    ``threadIdx.x`` handles one lag ``h = threadIdx.x``, the ``threadIdx.y``
    threads of that lag split the rows between them, and thread (0, 0)
    finally writes the maximum over the lags.

    Parameters
    ----------
    D : 2-D device array; columns are the series that get paired.
    T : 2-D device array indexed as ``T[c_x, c_y]`` with both indices in
        ``range(D.shape[1])``; receives the result.

    The kernel must be launched with a block shape equal to ``tpb``: the
    shared arrays are sized from ``tpb``/``tsize``/``hsize`` and are indexed
    directly by ``threadIdx``.
    """
    gw = cuda.gridDim.x
    bx = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bh = cuda.blockDim.y

    c_num = D.shape[1]
    c_index = bx
    while c_index < c_num * c_num:
        # Integer division/modulo turn the flat pair index into (c_x, c_y).
        c_x = c_index // c_num
        c_y = c_index % c_num
        if c_x == c_y:
            T[c_x, c_y] = 0.0
        else:
            # c_index depends only on blockIdx, so the whole block takes this
            # branch together and every thread reaches each syncthreads().
            X = D[:, c_x]
            Y = D[:, c_y]
            hbuf = cuda.shared.array(hsize, float32)

            h = tx
            Xi = X[h:]
            L = len(Xi)
            # Slice by explicit length instead of X[:-h]: for h == 0 the stop
            # bound -0 is 0, which would give an EMPTY view that is then
            # indexed up to L - 1.  For h > 0 both spellings are the same.
            Xi1 = X[:L]
            Yih = Y[:L]
            sbuf = cuda.shared.array(tsize, float32)

            # mean
            # One thread per lag computes the three means and publishes them.
            # Only sbuf[0, tx] is read again further down in this kernel.
            if ty == 0:
                Xi_m = 0.0
                Xi1_m = 0.0
                Yih_m = 0.0
                for i in range(L):
                    Xi_m += Xi[i]
                    Xi1_m += Xi1[i]
                    Yih_m += Yih[i]
                Xi_m = Xi_m / L
                Xi1_m = Xi1_m / L
                Yih_m = Yih_m / L
                sbuf[0, tx] = Xi_m
                sbuf[1, tx] = Xi1_m
                sbuf[2, tx] = Yih_m
            cuda.syncthreads()

            # Each y-thread accumulates the rows ty, ty + bh, ty + 2*bh, ...
            sl = cuda.shared.array(tpb, float32)
            r_index = ty
            s_l = 0.0
            while r_index < L:
                s1 = 0.0
                for i in range(L):
                    s1 += (Xi[r_index] + Xi1[i]) / sbuf[0, tx]
                s_l += s1
                r_index += bh
            sl[tx, ty] = s_l
            cuda.syncthreads()

            # Combine the partial sums of one lag into hbuf[0, tx].
            if ty == 0:
                ht = 0.0
                for i in range(bh):
                    ht += sl[tx, i]
                hbuf[0, tx] = ht / L
            cuda.syncthreads()

            # max
            # A single thread scans the per-lag values and stores the largest.
            if tx == 0 and ty == 0:
                m_t = 0.0
                for index, ele in enumerate(hbuf[0]):
                    if index == 0:
                        m_t = ele
                    elif ele > m_t:
                        m_t = ele
                T[c_x, c_y] = m_t
        c_index += gw


# Driver: random 300 x 200 float32 input, c x c host result buffer.
df = np.random.random_sample((300, 200)) + 10
D = np.array(df, dtype=np.float32)
r, c = D.shape

T = np.empty([c, c])

dD = cuda.to_device(D)
dT = cuda.device_array_like(T)

calcu_T[bpg, tpb](dD, dT)
dT.copy_to_host(T)

Device 0: CUDA Driver Version / Runtime Version 8.0 / 8.0 CUDA Capability Major/Minor version number: 5.0 Total amount of global memory: 2048 MBytes (2147483648 bytes) ( 5) Multiprocessors, (128) CUDA Cores/MP: 640 CUDA Cores Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096) Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers Total amount of constant memory: 65536 bytes Total amount of shared memory per block: 49152 bytes Total number of registers available per block: 65536 Warp size: 32 Maximum number of threads per multiprocessor: 2048 Maximum number of threads per block: 1024 Max dimension size of a thread block (x,y,z): (1024, 1024, 64) Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535) Maximum memory pitch: 2147483647 bytes Texture alignment: 512 bytes

1条回答

网友

1楼 · 发布于 2024-05-29 10:58:26

你的代码没有问题。如果我在GTX970上运行你的代码，我会得到：

In [11]: main??
Signature: main()
Source:   
def main():
    """Run the ``calcu_T`` kernel once on a random 300 x 200 float32 matrix.

    The input is copied to the device, a device buffer shaped like the
    square host result array is allocated, the kernel is launched with the
    module-level ``bpg`` / ``tpb`` configuration, and the result is copied
    back into the host array.  Nothing is returned.
    """
    samples = np.random.random_sample((300, 200)) + 10
    host_in = np.array(samples, dtype=np.float32)
    n_cols = host_in.shape[1]

    # One output cell per ordered pair of input columns.
    host_out = np.empty([n_cols, n_cols])

    dev_in = cuda.to_device(host_in)
    dev_out = cuda.device_array_like(host_out)

    calcu_T[bpg, tpb](dev_in, dev_out)
    dev_out.copy_to_host(host_out)
File:      ~/SO/crash.py
Type:      function

In [12]: %timeit -n 3 -r 3 main()
3 loops, best of 3: 6.61 s per loop

也就是说，没有运行时错误，但是包含内核的python代码需要6.6秒才能运行。如果我用CUDA分析器分析代码：

（原文此处的CUDA分析器输出在转载时丢失。）

您可以看到您发布的内核需要6.5秒才能运行。

您没有提供详细信息，但我猜您是在Windows上运行的，您的GPU同时用作显示GPU，而您的代码运行得非常慢，以至于触发了WDDM显示管理器的看门狗超时限制。这一限制有非常完善的文档说明，相关问题也已经被问过几百次了，例如here。

你所选择的搜索引擎和CUDA Windows入门指南将为你提供从操作系统和硬件角度改善情况的替代方案的信息。然而，最明显的办法是改进代码，使其运行得更快。

相关问题更多 >

编程相关推荐

热门问题

热门文章