Question

这是我的压缩列格式稀疏矩阵乘法的代码

__kernel void mykernel(__global int* colvector,
                       __global int* val,
                       __global int* result,
                       __global int* index,
                       __global int* rowptr,
                       __global int* sync )
{   
    __local int vals[1000];
    for(int i=0;i<4;i++)
    {
        result[i]=0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    barrier(CLK_GLOBAL_MEM_FENCE);

    const int items_per_row=32;//total threads working in a row

    const int thread_id=get_global_id(0)+get_local_id(0);//total threads in the program

    const int warpid = thread_id/items_per_row;//warp id is actual row

    int lane=thread_id&(items_per_row-1);//thread id within the warp

    int row = warpid;

    if(row<4)
    {
        int sum = 0;

        int row_start = rowptr[row];
        int row_end = rowptr[row+1];

        vals[get_global_id(0)]=0;
        barrier(CLK_LOCAL_MEM_FENCE);
        barrier(CLK_GLOBAL_MEM_FENCE);


        for (int i = row_start+lane; i<row_end; i+=items_per_row)
        {
            vals[get_local_id(0)]+=val[i]*colvector[index[i]];
        }

        barrier(CLK_LOCAL_MEM_FENCE);
        barrier(CLK_GLOBAL_MEM_FENCE);

        if (lane < 16 ) vals[get_local_id(0)] += vals[get_local_id(0) + 16];

        if (lane < 8 ) vals[get_local_id(0)] += vals[get_local_id(0) + 8];

        if (lane < 4 ) vals[get_local_id(0)] += vals[get_local_id(0) +4];

        if (lane < 2 ) vals[get_local_id(0)] += vals[get_local_id(0) + 2];

        if (lane < 1 ) vals[get_local_id(0)] += vals[get_local_id(0) + 1];

        barrier(CLK_LOCAL_MEM_FENCE);
        barrier(CLK_GLOBAL_MEM_FENCE);

        if(lane==0)
        {
            result[row] += vals[get_local_id(0)];
        }
    }
}

上面的OpenCL代码是从下面给出的CUDA代码转换而来的：

spmv_csr_vector_kernel(const int num_rows,
                       const int * ptr,
                       const int * indices,
                       const float * data,
                       const float * x,
                       float * y )
{
    __shared__ float vals[];

    int thread_id = blockDim.x * blockIdx.x + threadIdx.x; // global thread index

    int warp_id = thread_id / 32; // global warp index

    int lane = thread_id & (32 - 1); // thread index within the warp

    // one warp per row

    int row = warp_id;

    if (row < num_rows)
    {
        int row_start = ptr[row];

        int row_end = ptr[row+1];

        // compute running sum per thread

        vals[threadIdx.x] = 0;

        for(int jj = row_start + lane; jj < row_end; jj += 32)
        {
            vals[threadIdx.x] += data[jj] * x[indices[jj]];
        }
        // parallel reduction in shared memory

        if (lane < 16) vals[threadIdx.x] += vals[threadIdx.x + 16];

        if (lane < 8) vals[threadIdx.x] += vals[threadIdx.x + 8];

        if (lane < 4) vals[threadIdx.x] += vals[threadIdx.x + 4];

        if (lane < 2) vals[threadIdx.x] += vals[threadIdx.x + 2];

        if (lane < 1) vals[threadIdx.x] += vals[threadIdx.x + 1];

        // first thread writes the result

        if (lane == 0)
        {
            y[row] += vals[threadIdx.x];
        }
    }
}

CUDA代码是正确的但我的OpenCL内核没有返回正确的输出。我已经尝试了一个星期，但没有解决方案。有人知道我犯的是什么错误吗？

Answer 1

我至少可以看到一个错误。 thread_id在每个代码中都不相同。在OpenCL中CUDA中的blockDim.x * blockIdx.x + threadIdx.x == get_global_id（0），而不是get_global_id（0）+ get_local_id（0）。另外get_local_id（0）== threadIdx.x

Answer 2

尝试使用天鹅，这可能有助于您了解问题。

你可以找到一篇关于它的文章here。

从CUDA转换为OpenCL时我错过了什么？或者为什么我的内核返回的输出不同于串行代码

2 个答案: