Reduction does not give consistent results

Asked: 2016-01-29 16:38:07

Tags: cuda, reduce

I am trying to implement my own sum reduction for large 1D arrays. I came up with a reduce kernel and a way of calling the kernel several times, reducing step by step until a single value is left. I know this is not the optimal way of computing this (if you look at my code it may get to the point where a kernel call has to be made just to add 3 values), but let's set that aside for the moment and try to make it work.

Basically, in short: every call to the reduce kernel reduces the data by a factor of MAXTHREADS, in this case 1024, so the array size shrinks by 1024 on each pass. When the size is smaller than 1024 it seems to work correctly, but for bigger values it fails to compute the right sum.

This happens for every array size I have tried. What am I missing?

I will also happily take comments on code quality, but I am mainly interested in getting it fixed.

Below I post the whole kernel and a snippet of the kernel calls. If I have left out some relevant part of the calls, please comment and I will fix it. The original code has error checking; all kernels always run and never return a cudaError.

The reduction kernel:

__global__ void  reduce6(float *g_idata, float *g_odata, unsigned int n){
    extern __shared__ float sdata[];

    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(MAXTREADS*2) + tid;
    unsigned int gridSize = MAXTREADS*2*gridDim.x;
    //blockSize==MAXTHREADS
    sdata[tid] = 0;
    float mySum = 0;   
    if (tid>=n) return; 
    if ( (gridDim.x>0) & ((i+MAXTREADS)<n)){
        while (i < n) { 
            sdata[tid] += g_idata[i] + g_idata[i+MAXTREADS]; 
            i += gridSize; 
        }
    }else{
        sdata[tid] += g_idata[i];
    }
   __syncthreads();


    // do reduction in shared mem
   if (tid < 512)
        sdata[tid] += sdata[tid + 512];
    __syncthreads();
    if (tid < 256)
        sdata[tid] += sdata[tid + 256];
    __syncthreads();

    if (tid < 128)
        sdata[tid] += sdata[tid + 128];
     __syncthreads();

    if (tid <  64)
       sdata[tid]  += sdata[tid +  64];
    __syncthreads();


#if (__CUDA_ARCH__ >= 300 )
    if ( tid < 32 )
    {
        // Fetch final intermediate sum from 2nd warp
        mySum = sdata[tid]+ sdata[tid + 32];
        // Reduce final warp using shuffle
        for (int offset = warpSize/2; offset > 0; offset /= 2) 
            mySum += __shfl_down(mySum, offset);
    }
    sdata[0]=mySum;
#else

    // fully unroll reduction within a single warp
    if (tid < 32) {
        warpReduce(sdata,tid);
    }
#endif
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}



__device__ void warpReduce(volatile float *sdata,unsigned int tid) {
    sdata[tid] += sdata[tid + 32];
    sdata[tid] += sdata[tid + 16];
    sdata[tid] += sdata[tid + 8];
    sdata[tid] += sdata[tid + 4];
    sdata[tid] += sdata[tid + 2];
    sdata[tid] += sdata[tid + 1];
}

Calling the kernels, for an arbitrary total_pixels:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>    
#define MAXTREADS 1024
__global__ void initKernel(float * devPtr, const float val, const size_t nwords)
{
    //https://stackoverflow.com/questions/10589925/initialize-device-array-in-cuda
    int tidx = threadIdx.x + blockDim.x * blockIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (; tidx < nwords; tidx += stride)
        devPtr[tidx] = val;
} 


int main()
{

size_t total_pixels = 5000;
unsigned long int n = (unsigned long int)total_pixels;
float* d_image, *d_aux;
float sum;
cudaMalloc(&d_image, total_pixels*sizeof(float));
cudaMalloc(&d_aux, sizeof(float)*(n + MAXTREADS - 1) / MAXTREADS);


for (int i = 0; i < 10; i++){
    sum = 0;
    cudaMemset(&d_image, 1, total_pixels*sizeof(float));

    int dimblockRed = MAXTREADS;
    int dimgridRed = (total_pixels + MAXTREADS - 1) / MAXTREADS;
    int reduceCont = 0;
    initKernel << < dimgridRed, dimblockRed >> >(d_image, 1.0, total_pixels);

    while (dimgridRed > 1) {
        if (reduceCont % 2 == 0){
            reduce6 << <dimgridRed, dimblockRed, MAXTREADS*sizeof(float) >> >(d_image, d_aux, n);
        }
        else{
            reduce6 << <dimgridRed, dimblockRed, MAXTREADS*sizeof(float) >> >(d_aux, d_image, n);
        }
        n = dimgridRed;
        dimgridRed = (n + MAXTREADS - 1) / MAXTREADS;
        reduceCont++;

    }
    if (reduceCont % 2 == 0){
        reduce6 << <1, dimblockRed, MAXTREADS*sizeof(float) >> >(d_image, d_aux, n);
        cudaMemcpy(&sum, d_aux, sizeof(float), cudaMemcpyDeviceToHost);

    }
    else{
        reduce6 << <1, dimblockRed, MAXTREADS*sizeof(float) >> >(d_aux, d_image, n);
        cudaMemcpy(&sum, d_image, sizeof(float), cudaMemcpyDeviceToHost);
    }
    printf("%f ", sum);
}
cudaDeviceReset();
return 0;
}

1 Answer:

Answer (score: 1)

There is a lot broken here, in both the host and device code, and I am not going to try to address everything. But at the very least I can see:

extern __shared__ float sdata[]; // must be declared volatile for the warp reduction to work reliably

This accumulation code is broken in a number of ways, but at minimum:

if (tid>=n) return; // undefined behaviour with __syncthreads() below
if ( (gridDim.x>0) & ((i+MAXTREADS)<n)){
    while (i < n) { 
        sdata[tid] += g_idata[i] + g_idata[i+MAXTREADS]; // out of bounds if i > n - MAXTREADS
        i += gridSize; 
    }
}else{
    sdata[tid] += g_idata[i]; // out of bounds if i > n
}
__syncthreads(); // potential deadlock 
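
The usual fix, and what the corrected kernel further down does, is to accumulate into a register with a plain grid-stride loop so every global load is guarded by the loop condition, and to have every thread store its partial sum to shared memory before the barrier. A sketch of that pattern:

float mySum = 0;
for (unsigned int i = blockIdx.x*blockDim.x + tid; i < n; i += blockDim.x*gridDim.x)
    mySum += g_idata[i];      // the loop condition keeps every access in bounds
sdata[tid] = mySum;           // every thread stores, no early return
__syncthreads();              // all threads of the block reach the barrier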

The shuffle-based reduction is also incorrect:

if ( tid < 32 )
{
    // Fetch final intermediate sum from 2nd warp
    mySum = sdata[tid]+ sdata[tid + 32];
    // Reduce final warp using shuffle
    for (int offset = warpSize/2; offset > 0; offset /= 2) 
        mySum += __shfl_down(mySum, offset);
}
sdata[0]=mySum; // must be conditionally executed only by thread 0, otherwise a memory race
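
The store has to be restricted to a single thread, as in the corrected kernel below:

if (tid == 0) g_odata[blockIdx.x] = mySum;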

The host code which calls the reduction kernels is a complete mystery. The reduction kernel only ever needs to be called at most twice, so the loop is redundant. And in the last phase of the reduction you call the kernel like this:

reduce6 << <1, dimblockRed, MAXTREADS*sizeof(float) >> >(d_aux, d_image, n);

but d_aux only has at most dimgridRed entries, so that is a memory overflow as well.
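
The second (and final) pass should reduce the dimgridRed partial block sums produced by the first pass, so the element count passed to the kernel must be dimgridRed, not n. In outline (see the complete listing below):

reduce6<<<dimgridRed, dimblockRed, MAXTHREADS*sizeof(float)>>>(d_image, d_aux, n);
if (dimgridRed > 1) {
    reduce6<<<1, dimblockRed, MAXTHREADS*sizeof(float)>>>(d_aux, d_image, dimgridRed);
}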

I believe what you actually want is something like this:

#include <cstdio>    
#include <assert.h>
#define MAXTHREADS 1024    


__device__ void warpReduce(volatile float *sdata,unsigned int tid) {
    sdata[tid] += sdata[tid + 32];
    sdata[tid] += sdata[tid + 16];
    sdata[tid] += sdata[tid + 8];
    sdata[tid] += sdata[tid + 4];
    sdata[tid] += sdata[tid + 2];
    sdata[tid] += sdata[tid + 1];
}


__global__ void mymemset(float* image, const float val, unsigned int N)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < N) {
        image[tid] = val;
        tid += gridDim.x * blockDim.x;
    }
}

__global__ void  reduce6(float *g_idata, float *g_odata, unsigned int n){
    extern __shared__ volatile float sdata[];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + tid;
    unsigned int gridSize = blockDim.x*gridDim.x;
    float mySum = 0;   
    while (i < n) { 
        mySum += g_idata[i]; 
        i += gridSize; 
    }
    sdata[tid] = mySum;
    __syncthreads();

   if (tid < 512)
        sdata[tid] += sdata[tid + 512];
    __syncthreads();
    if (tid < 256)
        sdata[tid] += sdata[tid + 256];
    __syncthreads();

    if (tid < 128)
        sdata[tid] += sdata[tid + 128];
     __syncthreads();

    if (tid <  64)
       sdata[tid]  += sdata[tid +  64];
    __syncthreads();


#if (__CUDA_ARCH__ >= 300)
    if ( tid < 32 )
    {
        mySum = sdata[tid] + sdata[tid + 32];
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            mySum += __shfl_down(mySum, offset);
        }
    }
#else
    if (tid < 32) {
        warpReduce(sdata,tid);
        mySum = sdata[0];
    }
#endif
    if (tid == 0) g_odata[blockIdx.x] = mySum;
}


int main()
{
    size_t total_pixels = 8000;
    unsigned long int n = (unsigned long int)total_pixels;
    float* d_image, *d_aux;
    cudaMalloc(&d_image, total_pixels*sizeof(float));
    cudaMalloc(&d_aux, sizeof(float)*(n + MAXTHREADS - 1) / MAXTHREADS);

    for (int i = 0; i < 10000; i++){
        {
            dim3 bsz = dim3(1024);
            dim3 gsz = dim3(total_pixels / bsz.x + ((total_pixels % bsz.x > 0) ? 1: 0));
            mymemset<<<gsz, bsz>>>(d_image, 1.0f, total_pixels);
            cudaDeviceSynchronize();
        }

        int dimblockRed = MAXTHREADS;
        int dimgridRed = (n + MAXTHREADS - 1) / MAXTHREADS;
        float sum;
        reduce6<<<dimgridRed, dimblockRed, MAXTHREADS*sizeof(float)>>>(d_image, d_aux, n);

        if (dimgridRed > 1) {
            reduce6<<<1, dimblockRed, MAXTHREADS*sizeof(float)>>>(d_aux, d_image, dimgridRed);
            cudaMemcpy(&sum, d_image, sizeof(float), cudaMemcpyDeviceToHost);
        } else {
            cudaMemcpy(&sum, d_aux, sizeof(float), cudaMemcpyDeviceToHost);
        }
        assert(sum == float(total_pixels));
    }
    cudaDeviceReset();
    return 0;
}

[For future reference, this is what an MCVE should look like.]

However, I am not going to spend more time trying to untangle the twisted logic of the kernel and host code in the accumulation phase. There are other things that should still be fixed (the grid size only needs to be as large as the maximum number of blocks that can be concurrently resident on the device), but I leave that as an exercise for the reader.
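
As a hint for that exercise: a minimal sketch of capping the grid size at the number of blocks that can actually be resident on the device could use the standard device-attribute and occupancy queries of the CUDA runtime (the surrounding names are simply the ones from the listing above):

int device = 0, numSMs = 0, blocksPerSM = 0;
cudaGetDevice(&device);
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, device);
// maximum resident blocks of reduce6 per SM for this block size and shared memory usage
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, reduce6,
                                              MAXTHREADS, MAXTHREADS*sizeof(float));
int maxResidentBlocks = numSMs * blocksPerSM;
int wantedBlocks = (int)((n + MAXTHREADS - 1) / MAXTHREADS);
int dimgridRed = (wantedBlocks < maxResidentBlocks) ? wantedBlocks : maxResidentBlocks;
// the grid-stride loop inside reduce6 then covers whatever the smaller grid does not touch directly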