将全局内存复制到共享内存

时间:2013-12-08 08:15:39

标签: c++ cuda

我想把数据从全局内存复制到共享内存,于是写了下面的内核:

// Copies 3 bytes per element from `image` into a per-block __shared__ buffer
// and then from that buffer into `out`, using a 2D grid of 2D blocks.
//
// Parameters:
//   image - device pointer to the source bytes (3 consecutive bytes per element).
//   out   - device pointer to the destination bytes (same layout as `image`).
//   n, m  - not referenced by this kernel body; presumably the image
//           dimensions -- confirm against the caller.
//
// NOTE(review): this is the version that exhibits the problem being asked
// about. The shared-memory slot is chosen from threadIdx.x alone, so in a 2D
// block every thread with the same threadIdx.x (but a different threadIdx.y)
// writes to and reads from the same three bytes of `shared`.
__global__ void test(unsigned char *image, unsigned char *out, int n, int m)
{
        // Global 2D coordinates of this thread across the whole grid.
        int x = threadIdx.x + blockIdx.x * blockDim.x;
        int y = threadIdx.y + blockIdx.y * blockDim.y;
        // Flattened element index; the row stride is the grid width in threads
        // (blockDim.x * gridDim.x), not a value derived from n or m.
        int index = x + y * blockDim.x * gridDim.x;


    // Per-block staging buffer: 16*16 elements, 3 bytes each (768 bytes).
    __shared__ unsigned char shared [16*16*3];

    // NOTE(review): with the dim3 threads(16,16) launch shown alongside this
    // kernel, threadIdx.x only ranges over 0..15, so this condition is always
    // true and does not select a unique slot per thread of the block.
    if (threadIdx.x < 256)

    {

    // Stage this element's 3 bytes; the slot index ignores threadIdx.y.
    shared[threadIdx.x*3+0] = image[index*3+0];
    shared[threadIdx.x*3+1] = image[index*3+1];
    shared[threadIdx.x*3+2] = image[index*3+2];


    }

    // Block-wide barrier between the shared-memory writes and the reads below.
    __syncthreads();

    if (threadIdx.x < 256)

    {
    // Write back whatever currently sits in the slot for this threadIdx.x.
    out[index*3+0] = shared[threadIdx.x*3+0];
    out[index*3+1] = shared[threadIdx.x*3+1];
    out[index*3+2] = shared[threadIdx.x*3+2];
    }

}

我有一个512x512图像,我以这种方式调用内核:

    // Host-side result buffer: n*m elements, 3 bytes each.
    out = (unsigned char*) malloc(n*m*3);

    // Device-side input and output buffers of the same size.
    // NOTE(review): the return codes of the CUDA calls in this snippet are
    // not inspected; a failure here would go unnoticed.
    cudaMalloc( (void**)&dev_image, n*m*3);
    cudaMalloc( (void**)&dev_out, n*m*3);
    cudaMemcpy( dev_image, image, n*m*3, cudaMemcpyHostToDevice);
    // `out` was just malloc'ed and holds no meaningful data, so uploading it
    // would only transfer indeterminate bytes. Zero the device output buffer
    // instead so that any element the kernel does not write is well defined.
    cudaMemset( dev_out, 0, n*m*3);

    // 32x32 blocks of 16x16 threads: 512x512 threads in total.
    dim3 threads(16,16);
    dim3 blocks(32, 32);

    test<<<blocks, threads>>>(dev_image, dev_out, n, m);
    // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
    // replacement and blocks until the kernel launched above has finished.
    cudaDeviceSynchronize();

    // Bring the kernel's output back to the host buffer.
    cudaMemcpy( out, dev_out, n*m*3, cudaMemcpyDeviceToHost );

知道我做错了什么吗?如何将全局内存的一部分复制到共享内存(一维)?

1 个答案:

答案 0 :(得分:3)

在你的内核中,检查 threadIdx.x < 256 是错误的,因为 threadIdx.x 不会大于 15。你必须使用线程在 16x16 线程块内的索引来进行检查和寻址。

我已将内核更改为:

// Round-trips 3 bytes per element from `image` through a per-block __shared__
// staging buffer into `out`.
//
// Launch layout: 2D grid of 2D blocks. The staging buffer holds 16*16
// elements, so only threads whose in-block linear index is below 256 take
// part; the element index uses the grid width in threads
// (blockDim.x * gridDim.x) as its row stride.
//
// Parameters:
//   image - device pointer to the source bytes (3 consecutive bytes per element).
//   out   - device pointer to the destination bytes (same layout as `image`).
//   n, m  - their product is the number of elements; threads whose element
//           index is n*m or more do nothing.
__global__ void test(unsigned char *image, unsigned char *out, int n, int m)
{
    const int kBytesPerElem = 3;

    // Static shared staging buffer: 16*16 elements, 3 bytes each.
    __shared__ unsigned char tile [16*16*3];

    // Position of this thread in the whole grid, and the flattened element
    // index derived from it.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int elem = row * blockDim.x * gridDim.x + col;

    // Linear index of this thread inside its block; selects the tile slot.
    const int slot = threadIdx.y * blockDim.x + threadIdx.x;

    // The same predicate guards both phases: the slot must fit in the tile
    // and the element must lie inside the n*m range.
    const bool active = (slot < 256) && (elem < n*m);

    if (active)
    {
        for (int c = 0; c < kBytesPerElem; ++c)
        {
            tile[slot*kBytesPerElem + c] = image[elem*kBytesPerElem + c];
        }
    }

    // Barrier sits outside the conditional so every thread of the block
    // reaches it, including the inactive ones.
    __syncthreads();

    if (active)
    {
        for (int c = 0; c < kBytesPerElem; ++c)
        {
            out[elem*kBytesPerElem + c] = tile[slot*kBytesPerElem + c];
        }
    }
}

此外,永远不要忘记在内核中进行边界检查(我已经添加了),也不要忘记对内核启动和 CUDA API 调用进行正确的错误检查。