这是CUDA中的错误吗? (遇到非法内存访问)

时间:2016-09-21 13:01:49

标签: cuda

我正在使用以下CUDA内核:

// Adds up data[0 .. MAX_INDEX) and writes the total, on top of a seed value
// of 2, to *sum_ptr.
// Only threadIdx.x is used for indexing (no blockIdx), so every block of a
// multi-block launch would walk the same elements and write the same result.
__global__
void sum_worker(int *data, int *sum_ptr)
{
        // Accumulator shared by all threads of the block.
        __shared__ int block_sum;
        int idx = threadIdx.x;
        // Private partial sum of this thread.
        int thread_sum = 0;

        // Thread 0 seeds the shared accumulator with 2.
        // NOTE(review): presumably this accounts for the prime 2 - confirm
        // against whatever fills `data`.
        if (threadIdx.x == 0)
                block_sum = 2;

        // Each thread starts at its own thread index and advances by the
        // block size until the index reaches MAX_INDEX.
        for (int i = idx; i < MAX_INDEX; i += blockDim.x)
                thread_sum += data[i];

        // Barrier: the seed written by thread 0 must be visible before any
        // thread adds to block_sum.
        __syncthreads();

        // Fold this thread's partial sum into the shared accumulator.
        atomicAdd(&block_sum, thread_sum);

        // Barrier: all partial sums must be added before thread 0 reads the
        // final value.
        __syncthreads();

        // A single thread publishes the result.
        if (threadIdx.x == 0)
                *sum_ptr = block_sum;
}

使用此代码启动:

sum_worker<<<1, 32>>>(primes_or_zeros, sum_buffer);

它工作正常(没有运行时错误并产生正确的结果)。但是,如果我将i += blockDim.x更改为i += 32,则下次调用cudaDeviceSynchronize()时会收到错误消息:

Cuda error 'an illegal memory access was encountered' in primes_gpu.cu at line 97

使用cuda-memcheck运行内核:

========= Invalid __global__ read of size 4
=========     at 0x00000108 in /home/clifford/Work/handicraft/2016/perfmeas/primes_gpu.cu:35:sum_worker(int*, int*)
=========     by thread (31,0,0) in block (0,0,0)
=========     Address 0x703b70d7c is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x472225]
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcudart.so.7.5 [0x146ad]
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcudart.so.7.5 (cudaLaunch + 0x143) [0x2ece3]
=========     Host Frame:./perfmeas [0x17c7]
=========     Host Frame:./perfmeas [0x16b7]
=========     Host Frame:./perfmeas [0x16e2]
=========     Host Frame:./perfmeas [0x153f]
=========     Host Frame:./perfmeas [0xdcd]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20830]
=========     Host Frame:./perfmeas [0xf39]
....

地址0x703b70d7c确实超出data的范围:数组从0x703b40000开始并具有MAX_INDEX元素。在此测试中MAX_INDEX为50000。 (0x703b70d7c - 0x703b40000)/ 4 = 50015.

为i >= 50000添加一个额外的检查会让问题神奇地消失:

    // Same summation loop with a constant step of 32, plus a diagnostic check.
    for (int i = idx; i < MAX_INDEX; i += 32) {
            // The loop condition already guarantees i < MAX_INDEX here, so
            // this branch should never be taken; it only reports an
            // out-of-range index if one ever shows up.
            if (i >= MAX_INDEX)
                    printf("WTF!\n");
            thread_sum += data[i];
    }

这是CUDA中的错误还是我在这里做了些蠢事?

我在Ubuntu 16.04上使用CUDA 7.5。nvcc --version的输出如下:

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2015 NVIDIA Corporation
Built on Tue_Aug_11_14:27:32_CDT_2015
Cuda compilation tools, release 7.5, V7.5.17

此测试用例的完整源代码可在此处找到:
http://svn.clifford.at/handicraft/2016/perfmeas

(使用选项-gx运行。此版本使用i += blockDim.x。将其更改为i += 32以重现此问题。)

编辑:@njuffa在评论中说,他不想点击指向Stack Overflow站外的链接,因为他“太害怕[他的]电脑可能会染上什么东西”,并且更希望有一个可以直接从Stack Overflow复制粘贴的测试用例。所以测试用例如下:

#include <string.h>
#include <stdio.h>
#include <stdbool.h>
#include <math.h>

#define MAX_PRIMES 100000
#define MAX_INDEX (MAX_PRIMES/2)

// Fills data[k] for k in [0, MAX_INDEX): the odd number 2*k+1 if trial
// division by odd divisors finds no factor, otherwise 0. Slot 0 (the number 1)
// is always stored as 0. Threads whose global index is >= MAX_INDEX do nothing.
__global__
void primes_worker(int *data)
{
    const int slot = blockIdx.x * blockDim.x + threadIdx.x;

    if (slot < MAX_INDEX) {
        const int candidate = 2 * slot + 1;

        // Slot 0 holds the number 1, which is never reported as prime.
        bool keep = (slot != 0);

        // Trial division by odd divisors up to sqrt(candidate).
        for (int divisor = 3; divisor * divisor <= candidate; divisor += 2) {
            if (candidate % divisor == 0) {
                keep = false;
                break;
            }
        }

        data[slot] = keep ? candidate : 0;
    }
}

// Adds up data[0 .. MAX_INDEX) and writes the total, on top of a seed value
// of 2, to *sum_ptr.
// Only threadIdx.x is used for indexing (no blockIdx), so every block of a
// multi-block launch would walk the same elements and write the same result.
// Building with ENABLE_BUG defined swaps the loop step from blockDim.x to the
// literal 32; the two are only equivalent when the block has 32 threads.
__global__
void sum_worker(int *data, int *sum_ptr)
{
    // Accumulator shared by all threads of the block.
    __shared__ int block_sum;
    int idx = threadIdx.x;
    // Private partial sum of this thread.
    int thread_sum = 0;

    // Thread 0 seeds the shared accumulator with 2.
    // NOTE(review): presumably this accounts for the prime 2, which
    // primes_worker never stores (it only writes odd values) - confirm.
    if (threadIdx.x == 0)
        block_sum = 2;

#ifdef ENABLE_BUG
    // Variant with a compile-time constant step.
    for (int i = idx; i < MAX_INDEX; i += 32)
        thread_sum += data[i];
#else
    // Variant whose step is the run-time block size.
    for (int i = idx; i < MAX_INDEX; i += blockDim.x)
        thread_sum += data[i];
#endif

    // Barrier: the seed written by thread 0 must be visible before any
    // thread adds to block_sum.
    __syncthreads();

    // Fold this thread's partial sum into the shared accumulator.
    atomicAdd(&block_sum, thread_sum);

    // Barrier: all partial sums must be added before thread 0 reads the
    // final value.
    __syncthreads();

    // A single thread publishes the result.
    if (threadIdx.x == 0)
        *sum_ptr = block_sum;
}

int *primes_or_zeros;
int *sum_buffer;

// Allocates the two global buffers: MAX_INDEX ints of device memory for
// primes_or_zeros and one int of host memory (via cudaMallocHost) for
// sum_buffer. A failed allocation is reported on stdout; execution continues.
void primes_gpu_init()
{
    cudaError_t status = cudaMalloc((void**)&primes_or_zeros, MAX_INDEX * sizeof(int));
    if (status != cudaSuccess) {
        printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(status), __FILE__, __LINE__);
    }

    status = cudaMallocHost((void**)&sum_buffer, sizeof(int));
    if (status != cudaSuccess) {
        printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(status), __FILE__, __LINE__);
    }
}

// Releases the buffers obtained by primes_gpu_init(): the device array first,
// then the host-side result slot. A failed release is reported on stdout and
// does not prevent the second release from being attempted.
void primes_gpu_done()
{
    cudaError_t status = cudaFree(primes_or_zeros);
    if (status != cudaSuccess) {
        printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(status), __FILE__, __LINE__);
    }

    status = cudaFreeHost(sum_buffer);
    if (status != cudaSuccess) {
        printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(status), __FILE__, __LINE__);
    }
}

// Runs the two kernels back to back and returns the value found in
// *sum_buffer afterwards.
//
//   1. primes_worker fills primes_or_zeros, one thread per element, using
//      enough 32-thread blocks to cover MAX_INDEX elements.
//   2. sum_worker reduces that array into *sum_buffer with a single block.
//
// Errors are reported on stdout and do not abort: the function still returns
// whatever *sum_buffer holds, exactly as before. In addition to the error
// returned by cudaDeviceSynchronize(), each launch is now followed by
// cudaGetLastError() so that a rejected launch (bad configuration, etc.) is
// reported instead of passing silently.
int primes_gpu()
{
    const int num_threads = 32;
    // Ceil-division so a partial last block is still launched.
    const int num_blocks = (MAX_INDEX + num_threads - 1) / num_threads;
    cudaError_t err;

    primes_worker<<<num_blocks, num_threads>>>(primes_or_zeros);

    // A kernel launch returns nothing; launch failures surface here.
    err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);

    // Kept at a literal 32 threads: the ENABLE_BUG variant of sum_worker
    // steps by the constant 32 and is only correct for that block size.
    sum_worker<<<1, 32>>>(primes_or_zeros, sum_buffer);

    err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);

    // Wait for both kernels; faults raised while they execute surface here.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);

    return *sum_buffer;
}

// Test driver: allocates the buffers, runs the GPU computation, prints the
// result and compares it with the expected total. Exit status is 0 on a
// match (after releasing the buffers) and 1 on a mismatch.
int main()
{
    const int expected = 454396537;

    primes_gpu_init();

    const int result = primes_gpu();
    printf("Result: %d\n", result);

    if (result == expected) {
        primes_gpu_done();
        return 0;
    }

    // Mismatch: report and leave without calling primes_gpu_done().
    printf("Incorrect result!\n");
    return 1;
}

用法:

$ nvcc -o demo demo.cu 
$ ./demo 
Result: 454396537

$ nvcc -D ENABLE_BUG -o demo demo.cu 
$ ./demo 
Cuda error 'an illegal memory access was encountered' in demo.cu at line 99
Result: 0
Incorrect result!

1 个答案:

答案 0 :(得分:4)

TL; DR :观察到的行为很可能是由CUDA 7.5工具链的ptxas组件中的错误引起的,特别是循环展开器。该错误可能已在CUDA 8.0 RC中修复,该版本已公开发布。

我能够在64位Windows 7平台上使用Quadro K2200 GPU(sm_50设备)重现问题中报告的行为。定义ENABLE_BUG后,生成的机器代码(SASS)的主要区别在于循环被展开了四倍。这是循环增量从变量(即blockDim.x)更改为编译时常量32的直接结果,因为这使编译器可以在编译时计算出循环的迭代次数。

值得注意的是,在中间的PTX级别,即使增量为32,循环仍然保持未展开(rolled)的状态:

BB7_4:
ld.global.u32 %r12, [%rd10];
add.s32 %r16, %r12, %r16;
add.s64 %rd10, %rd10, 128;
add.s32 %r15, %r15, 32;
setp.lt.s32     %p3, %r15, 50000;
@%p3 bra BB7_4;

由于循环在机器代码中被展开了,执行该转换的必然是ptxas的展开器。

如果我通过在nvcc命令行上指定-Xptxas -O1将ptxas优化级别降低到-O1,代码将按预期工作。如果我为sm_30构建代码(在sm_50设备上运行时会触发JIT编译),则在使用最新的Windows驱动程序369.26时,代码也能按预期工作。这强烈表明CUDA 7.5的ptxas组件的展开器中存在一个错误,但该错误已经被修复,因为CUDA驱动程序中的ptxas组件比CUDA 7.5工具链中的ptxas组件新得多。

将#pragma unroll 4直接放在循环前面也可以解决问题,因为在这种情况下,展开是由编译器的nvvm组件执行的,这意味着展开后的循环在PTX级别就已经出现:

#if ENABLE_BUG
#pragma unroll 4
    for (int i = idx; i < MAX_INDEX; i += 32)
        thread_sum += data[i];
#else

产生的PTX:

BB7_5:
.pragma "nounroll";
ld.global.u32 %r34, [%rd14];
add.s32 %r35, %r34, %r45;
ld.global.u32 %r36, [%rd14+128];
add.s32 %r37, %r36, %r35;
ld.global.u32 %r38, [%rd14+256];
add.s32 %r39, %r38, %r37;
ld.global.u32 %r40, [%rd14+384];
add.s32 %r45, %r40, %r39;
add.s64 %rd14, %rd14, 512;
add.s32 %r44, %r44, 128;
setp.lt.s32     %p5, %r44, %r3;
@%p5 bra BB7_5;