奇偶排序：在CUDA中使用多个块时结果不正确

# Odd-even transposition ("brick") sort on the GPU with PyCUDA.
#
# __syncthreads() is a barrier for the threads of ONE block only. A kernel
# that loops over every phase internally is therefore only correct when it
# is launched with a single block; with several blocks, pairs that straddle
# a block boundary are compared without any ordering guarantee.
#
# To stay correct for any grid size, each kernel launch below performs
# exactly one compare-and-swap phase and the host issues the n phases.
# Launches queued on the same stream execute in order, so every launch
# boundary acts as a device-wide synchronization point.
bricksort_src = """
__global__ void bricksort(int *data, int n, int phase)
{
    int tid = threadIdx.x + (blockIdx.x * blockDim.x);

    // phase 0 handles pairs (0,1), (2,3), ...
    // phase 1 handles pairs (1,2), (3,4), ...
    int idx = tid * 2 + phase;

    // Requiring the right-hand neighbour to exist keeps the access in
    // bounds for both odd and even n.
    if (idx + 1 < n) {
        int f = data[idx];
        int s = data[idx + 1];
        if (f > s) {
            data[idx] = s;
            data[idx + 1] = f;
        }
    }
}
"""

bricksort_ker = SourceModule(source=bricksort_src)
bricksort = bricksort_ker.get_function("bricksort")

np.random.seed(0)
arr = np.random.randint(0, 10, 2**11).astype('int32')

iar = gpuarray.to_gpu(arr)
# Sort a device-side copy in place so the input array is left untouched.
oar = iar.copy()
n = iar.size

# One thread per adjacent pair; integer arithmetic avoids float rounding.
num_threads = (n + 1) // 2
# Clamp to at least 1 so that n == 0 or n == 1 still yields a valid launch
# configuration (the kernel's bounds check makes such a launch a no-op).
blocksize = max(1, min(num_threads, 1024))
gridsize = max(1, -(-num_threads // blocksize))  # ceiling division

# n alternating phases are sufficient to sort n elements.
for phase_index in range(n):
    bricksort(
        oar,
        np.int32(n),
        np.int32(phase_index % 2),
        block=(blocksize, 1, 1),
        grid=(gridsize, 1, 1),
    )

1条回答

网友

1楼 · 发布于 2024-05-14 07:58:10

将评论汇总成答案：

奇偶排序不能轻易地扩展到单个线程块之外（因为它需要同步），而 CUDA 的 __syncthreads() 只在线程块级别进行同步。在没有同步的情况下，CUDA 不保证线程以任何特定顺序执行
对于严肃的排序工作，我建议使用cub之类的库实现。如果您想从python执行此操作，我建议使用cupy
CUDA 有一个示例代码（sample code）在块级别演示了奇偶排序，但由于同步问题，它选择了合并（merge）方法来组合结果
应该可以编写一个奇偶排序内核，只进行一次交换，然后在循环中调用这个内核。内核调用本身充当设备范围的同步点
或者，应该可以使用协作组（cooperative groups）的网格级同步（grid sync），在单个内核启动中完成这项工作
所有这些方法都不太可能比一个好的库实现更快（好的库实现本来就不依赖奇偶排序）

相关问题更多 >

编程相关推荐

热门问题

热门文章