Check failed: error == cudaSuccess (77 vs. 0) an illegal memory access was encountered(检查失败:遇到非法内存访问)

时间:2017-07-03 13:50:46

标签: cuda

我正在调试一段包含若干 CUDA 操作的冗长代码。在调用 cudaMemcpy(..., ..., cudaMemcpyHostToDevice) 时遇到了上述错误,但我不确定该错误是否真的与这次调用有关。

以下是代码段:

    // Debug round trip: allocate a host buffer and a device buffer of the same
    // size, copy host -> device, then copy device -> host again.
    // NOTE(review): an error reported by these calls may originate from an
    // earlier asynchronous kernel launch rather than from the copy itself.
    int num_elements = 8294400; // --> I also tried it with "1" here which didn't work either!
    // Host buffer; its elements are not initialized before being copied.
    float *checkArray = new float[num_elements];
    float *checkArray_GPU;
    // CUDA_CHECK is defined elsewhere; presumably it reports any result other
    // than cudaSuccess — confirm against its definition.
    CUDA_CHECK(cudaMalloc(&checkArray_GPU, num_elements * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(checkArray_GPU, checkArray, num_elements * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(checkArray, checkArray_GPU, num_elements * sizeof(float), cudaMemcpyDeviceToHost));    
    // NOTE(review): neither buffer is released in this excerpt (no delete[] or
    // cudaFree is visible here).

其中 CUDA_CHECK 只是一个用于打印 CUDA 错误的宏(它是现有代码的一部分,对其他所有 cudaMemcpy 或 cudaMalloc 调用都能正常工作,因此它不是问题的一部分)。奇怪的是,把这段代码单独放到一个玩具 *.cu 示例中执行时,它工作正常。

所以我的假设是,由于程序中先前的cuda操作,有一些错误未被报告导致上面的代码片段中的错误。可能是吗? 有没有办法检查是否存在涉及cuda的未报告错误?

我的另一个猜测是,问题可能与我使用的特定显卡有关。我使用的是 Nvidia Titan X Pascal、CUDA 8.0 和 cuDNN v5.1。我还尝试过使用一些特殊的编译器标志来编译代码,例如

-arch=sm_30 \
 -gencode=arch=compute_20,code=sm_20 \
 -gencode=arch=compute_30,code=sm_30 \
 -gencode=arch=compute_50,code=sm_50 \
 -gencode=arch=compute_52,code=sm_52 \
 -gencode=arch=compute_52,code=compute_52 \
 -gencode=arch=compute_60,code=sm_60 \
 -gencode=arch=compute_61,code=sm_61 \
 -gencode=arch=compute_62,code=sm_62 \

但到目前为止这并没有帮助。为完整起见,下面是我当前的简化 Makefile:

# Simplified build for a CUDA shared library: compile $(TARGET).cu to an
# object file, then link it into $(TARGET).so. No -arch / -gencode flags are
# passed in this excerpt.
# NOTE(review): recipe lines must start with a TAB in a real Makefile; the
# leading spaces shown here may be a copy/paste artifact.
NVCC = nvcc
CUDA_INC = -I/usr/local/cuda/include 
CUDA_LIB = -L/usr/local/cuda/lib64
TARGET = myProgramm
OPTS = -std=c++11
# Link step: produce the shared object from the compiled object file.
$(TARGET).so: $(TARGET).o
    $(NVCC) $(OPTS) -shared $(TARGET).o $(CUDA_LIB) -o $(TARGET).so
# Compile step: -Xcompiler -fPIC forwards -fPIC to the host compiler so the
# object can be placed in a shared library.
$(TARGET).o: $(TARGET).cu headers/some_header.hpp 
    $(NVCC) $(OPTS) $(CUDA_INC) -Xcompiler -fPIC -c $(TARGET).cu 

有谁知道该如何查清这个问题的根源?

修改
使用 cuda-memcheck 是个好主意:它显示错误实际上发生得更早,即在调用 Kernel_set_value 期间:

========= Invalid __global__ write of size 4
=========     at 0x00000298 in void Kernel_set_value<float>(unsigned long, unsigned long, float*, float)
=========     by thread (480,0,0) in block (30,0,0)
=========     Address 0x0005cd00 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x209035]
[...]
=========     Host Frame:/media/.../myProgramm.so (_ZN5boost6python6detail6invokeIiPFvRKSsENS0_15arg_from_pythonIS4_EEEEP7_objectNS1_11invoke_tag_ILb1ELb0EEERKT_RT0_RT1_ + 0x2d) [0x3e5eb]
[...]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaMemcpy. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x2f4e33]
=========     Host Frame:/media/.../myProgramm.so [0x7489f]
F0703 16:23:54.840698 26207 myProgramm.cu:411] Check failed: error == cudaSuccess (4 vs. 0)  unspecified launch failure
[...]
=========     Host Frame:python (Py_Main + 0xb5e) [0x66d92]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
=========     Host Frame:python [0x177c2e]
=========
*** Check failure stack trace: ***
========= Error: process didn't terminate successfully
========= Internal error (20)
========= No CUDA-MEMCHECK results found

但是函数 Kernel_set_value 在玩具示例中也可以正常工作。使用 Kernel_set_value 时是否有什么需要特别注意的地方?下面是它及其相关辅助函数的源代码。

// Threads per block used by every launch in this file.
#define CUDA_NUM_THREADS 512
// Upper bound on the number of blocks in a single launch.
#define MAX_NUM_BLOCKS 2880

/// @brief Number of blocks needed to cover N elements, capped at MAX_NUM_BLOCKS.
/// @param N Number of elements to process.
/// @return ceil(N / CUDA_NUM_THREADS) clamped to [0, MAX_NUM_BLOCKS]; 0 when N is 0.
inline int CUDA_GET_BLOCKS(const size_t N) {
  const size_t threads_per_block = size_t(CUDA_NUM_THREADS);
  const size_t max_blocks = size_t(MAX_NUM_BLOCKS);
  // Round up without computing N + threads_per_block - 1, which wraps around
  // for N close to the maximum size_t value.
  const size_t needed = N / threads_per_block + (N % threads_per_block != 0 ? 1 : 0);
  // Clamp while still in size_t: narrowing to int first would turn very large
  // block counts into negative or truncated values.
  return static_cast<int>(needed < max_blocks ? needed : max_blocks);
}

/// @brief Number of elements each thread must handle so that
///        CUDA_GET_BLOCKS(N) * CUDA_NUM_THREADS threads cover N elements.
/// @param N Number of elements to process.
/// @return ceil(N / total_threads); 0 when no threads would be launched (N == 0).
inline size_t CUDA_GET_LOOPS(const size_t N) {
  const size_t total_threads = size_t(CUDA_GET_BLOCKS(N)) * size_t(CUDA_NUM_THREADS);
  // With zero threads there is nothing to iterate over; also avoids dividing by zero.
  if (total_threads == 0) return 0;
  // Round up without computing N + total_threads - 1, which can wrap around.
  return N / total_threads + (N % total_threads != 0 ? 1 : 0);
}

/// @brief Writes `value` into elements of GPUdst, one contiguous run per thread.
///
/// Each thread derives a linear rank from blockIdx.x and threadIdx.x and fills
/// the index range [rank * CUDA_NUM_LOOPS, rank * CUDA_NUM_LOOPS + CUDA_NUM_LOOPS),
/// cut off at N. Threads whose first index is at or beyond N do nothing.
///
/// @param CUDA_NUM_LOOPS Number of consecutive elements assigned to each thread.
/// @param N              Exclusive upper bound on the indices written.
/// @param GPUdst         Destination array; assumed to hold at least N writable
///                       elements — the kernel cannot verify this.
/// @param value          Value stored into every covered element.
///
/// NOTE(review): only the x components of the block and thread indices are
/// used, and the block width is taken from CUDA_NUM_THREADS rather than
/// blockDim.x — presumably the launch is 1-D with CUDA_NUM_THREADS threads per
/// block; confirm at the call site.
template <typename Dtype>
__global__ void Kernel_set_value(size_t CUDA_NUM_LOOPS, size_t N, Dtype* GPUdst, Dtype value){
  const size_t thread_rank = size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x);
  const size_t first = size_t(CUDA_NUM_LOOPS) * thread_rank;
  if (first < N) {
    // End of this thread's run, never past N.
    const size_t last = min(N, first + CUDA_NUM_LOOPS);
    for (size_t pos = first; pos < last; ++pos) {
      GPUdst[pos] = value;
    }
  }
}

1 个答案:

答案 0 :(得分:-1)

所以最终的解决方案是:编译时不要加任何 -gencode=arch=compute_XX,code=sm_XX 形式的标志。我花了很长时间才找到这一点。实际的错误信息非常具有误导性(error == cudaSuccess (77 vs. 0) an illegal memory access、(4 vs. 0) unspecified launch failure、(8 vs. 0) invalid device function)。