指向主机结构中的设备数组的指针

时间:2017-04-01 19:11:45

标签: struct cuda

我正在尝试创建一个结构体,把主机数组和设备数组的指针保存在同一个地方,并且该结构体本身驻留在主机上。之后我打算把它扩展为链表的一个元素。基本结构如下所示:

(结构体定义见下方的 `Data` 代码。)

其中 `h` 指向主机上的 double 数组,`d` 指向设备上的 double 数组。

关于将整个结构复制到设备(CUDA cudaMemcpy Struct of Arrays),有各种各样的答案,但没有一个完全符合我的需要。我有以下代码,但不断收到非法内存访问错误。

/* Pairs a host-side buffer with its device-side counterpart. */
typedef struct Data Data;
struct Data {
    double *h;  /* array of doubles in host memory */
    double *d;  /* array of doubles in device memory */
};

完整代码如下(原文中此处列出的程序输出已缺失):

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "cuda.h"

/*
* CUDA Error stuff
*/

/*
 * Aborts the program when a CUDA runtime call did not succeed.
 *
 * err  - status code returned by the CUDA call being checked
 * file - source file name reported in the diagnostic
 * line - source line number reported in the diagnostic
 *
 * Returns normally when err is cudaSuccess; otherwise prints the error
 * text together with its location and exits with EXIT_FAILURE.
 */
static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err == cudaSuccess) {
        return;
    }
    const char *reason = cudaGetErrorString( err );
    printf( "%s in %s at line %d\n", reason, file, line );
    exit( EXIT_FAILURE );
}
/* Checks a CUDA runtime status, reporting the call site on failure. */
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

/*
 * Exits with a diagnostic when a host allocation returned NULL.
 * The argument is parenthesized so any pointer expression is compared as a
 * whole, and the body is wrapped in do { } while (0) so the macro behaves as
 * a single statement (safe inside an unbraced if/else).
 */
#define HANDLE_NULL( a ) do { if ((a) == NULL) { \
                            printf( "Host memory failed in %s at line %d\n", \
                                    __FILE__, __LINE__ ); \
                            exit( EXIT_FAILURE );} } while (0)

/*
 * Prints a message on its own line and hands the given code back.
 *
 * message   - text written to standard output, followed by a newline
 * errorCode - value returned to the caller unchanged
 */
int errMsg(const char *message, int errorCode)
{
    const int result = errorCode;
    printf("%s\n", message);
    return result;
}

/* Pairs a host-side buffer with its device-side counterpart. */
typedef struct Data Data;
struct Data {
    double *h;  /* host array (allocated with malloc in main) */
    double *d;  /* device array (allocated with cudaMalloc in main) */
};

/*
 * Writes 2 into the first 100 elements of the array referenced by d->d.
 *
 * d - pointer to a Data record; it is dereferenced in device code, so both
 *     the record itself and the array behind its `d` member must be
 *     accessible from the device.
 *
 * Each thread handles the element at its global 1D index; threads whose
 * index is 100 or more do nothing.
 */
__global__ void kernel(Data *d)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= 100) {
        return;
    }
    d->d[idx] = 2;
}

/*
 * Reproduction from the question: fills a 100-element host array, copies it
 * to the device, launches `kernel`, and copies the result back.
 * As written, the launch passes a host pointer to the kernel, which is the
 * cause of the illegal memory access.
 */
int main()
{
    Data *d;
    // The Data record itself lives in host memory (plain malloc).
    d = (Data*)malloc(sizeof(Data));

    d->h = (double*)malloc(sizeof(double)*100);
    // Only the array behind d->d is allocated on the device.
    HANDLE_ERROR( cudaMalloc((void**) &(d->d), 100*sizeof(double)) );

    for(int i=0; i<100; i++){
        d->h[i] = i;
    }

    HANDLE_ERROR( cudaMemcpy(d->d, d->h, 100*sizeof(double), cudaMemcpyHostToDevice) );

    printf("%f\n", d->h[1]);

    // NOTE: `d` is a host address. The kernel dereferences it (d->d) in
    // device code, which is an illegal memory access.
    kernel<<<1, 102>>>(d);

    printf("done\n");

    {
    // Errors raised while the kernel runs surface at this synchronization.
    cudaError_t cudaerr = cudaDeviceSynchronize();
    if (cudaerr != cudaSuccess)
        printf("kernel launch failed with error \"%s\"->\n",
               cudaGetErrorString(cudaerr));
    }

    // Copy the device array back into the host array and print its last element.
    HANDLE_ERROR( cudaMemcpy(d->h, d->d, 100*sizeof(double), cudaMemcpyDeviceToHost) );
    printf("%f\n", d->h[99]);


    return 0;
}

我怀疑只是我把指针弄错了。错误处理代码来自 Wiley 出版的一本 CUDA 入门书,如果这里不允许使用该代码,我会把它删掉。

感谢。

1 个答案:

答案 0 :(得分:1)

问题在于 `d` 本身是一个指向主机端分配的结构体的指针(该结构体包含 `d` 和 `h` 两个指针)。当你像下面这样把结构体指针 `d` 传给内核时:

kernel<<<1, 102>>>(d);
                   ^
                   this is a pointer to memory on the host

然后尝试在设备代码中取消引用该指针:

    d->...;
     ^ 
     This operator dereferences the pointer to the left of it

就会发生非法内存访问。

至少有两种明显的方法可以解决这个问题:

  1. 按值而不是按指针传递结构体。以下是一个例子:

    $ cat t1311.cu
    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    #include "cuda.h"
    
    /*
    * CUDA Error stuff
    */
    
    /*
     * Aborts the program when a CUDA runtime call did not succeed.
     *
     * err  - status code returned by the CUDA call being checked
     * file - source file name reported in the diagnostic
     * line - source line number reported in the diagnostic
     *
     * Returns normally when err is cudaSuccess; otherwise prints the error
     * text together with its location and exits with EXIT_FAILURE.
     */
    static void HandleError( cudaError_t err,
                             const char *file,
                             int line ) {
        if (err == cudaSuccess) {
            return;
        }
        const char *reason = cudaGetErrorString( err );
        printf( "%s in %s at line %d\n", reason, file, line );
        exit( EXIT_FAILURE );
    }
    /* Checks a CUDA runtime status, reporting the call site on failure. */
    #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

    /*
     * Exits with a diagnostic when a host allocation returned NULL.
     * The argument is parenthesized so any pointer expression is compared as
     * a whole, and the body is wrapped in do { } while (0) so the macro
     * behaves as a single statement (safe inside an unbraced if/else).
     */
    #define HANDLE_NULL( a ) do { if ((a) == NULL) { \
                                printf( "Host memory failed in %s at line %d\n", \
                                        __FILE__, __LINE__ ); \
                                exit( EXIT_FAILURE );} } while (0)
    
    /*
     * Prints a message on its own line and hands the given code back.
     *
     * message   - text written to standard output, followed by a newline
     * errorCode - value returned to the caller unchanged
     */
    int errMsg(const char *message, int errorCode)
    {
        const int result = errorCode;
        printf("%s\n", message);
        return result;
    }
    
    /* Pairs a host-side buffer with its device-side counterpart. */
    typedef struct Data Data;
    struct Data {
        double *h;  /* host array (allocated with malloc in main) */
        double *d;  /* device array (allocated with cudaMalloc in main) */
    };
    
    /*
     * Writes 2 into the first 100 elements of the array referenced by d.d.
     *
     * d - Data record received by value; only the array behind its `d`
     *     member is dereferenced in device code, so that member must point
     *     to device-accessible memory.
     *
     * Each thread handles the element at its global 1D index; threads whose
     * index is 100 or more do nothing.
     */
    __global__ void kernel(Data d)
    {
        const int idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= 100) {
            return;
        }
        d.d[idx] = 2;
    }
    
    /*
     * Demonstrates fix 1: the Data record is a host-side value and is passed
     * to the kernel by value, so device code only dereferences the device
     * pointer stored in its `d` member.
     *
     * Prints h[1] before the launch, "done" after it, and h[99] once the
     * results are copied back. Returns 0 on success. A failed host
     * allocation, CUDA API call, or kernel launch exits with EXIT_FAILURE;
     * a failed synchronize is reported and the copy-back is still attempted.
     */
    int main()
    {
        Data d;

        d.h = (double*)malloc(sizeof(double)*100);
        HANDLE_NULL( d.h );  // never write through a failed host allocation
        HANDLE_ERROR( cudaMalloc((void**) &(d.d), 100*sizeof(double)) );

        for(int i=0; i<100; i++){
            d.h[i] = i;
        }

        HANDLE_ERROR( cudaMemcpy(d.d, d.h, 100*sizeof(double), cudaMemcpyHostToDevice) );

        printf("%f\n", d.h[1]);

        // 102 threads are launched; the kernel's bound check idles the extra two.
        kernel<<<1, 102>>>(d);
        // A launch returns no status; launch errors are read via cudaGetLastError().
        HANDLE_ERROR( cudaGetLastError() );

        printf("done\n");

        {
        // Errors raised while the kernel runs surface at this synchronization.
        cudaError_t cudaerr = cudaDeviceSynchronize();
        if (cudaerr != cudaSuccess)
            printf("kernel launch failed with error \"%s\"->\n",
                   cudaGetErrorString(cudaerr));
        }

        HANDLE_ERROR( cudaMemcpy(d.h, d.d, 100*sizeof(double), cudaMemcpyDeviceToHost) );
        printf("%f\n", d.h[99]);

        // Release the device and host buffers before exiting.
        HANDLE_ERROR( cudaFree(d.d) );
        free(d.h);

        return 0;
    }
    $ nvcc -arch=sm_35 -o t1311 t1311.cu
    $ cuda-memcheck ./t1311
    ========= CUDA-MEMCHECK
    1.000000
    done
    2.000000
    ========= ERROR SUMMARY: 0 errors
    $
    
    2. 为主机指针 `d` 所指向的结构体创建一个设备端副本。以下是一个例子:

      $ cat t1311.cu
      #include <stdio.h>
      #include <stdlib.h>
      #include <math.h>
      #include "cuda.h"
      
      /*
      * CUDA Error stuff
      */
      
      /*
       * Aborts the program when a CUDA runtime call did not succeed.
       *
       * err  - status code returned by the CUDA call being checked
       * file - source file name reported in the diagnostic
       * line - source line number reported in the diagnostic
       *
       * Returns normally when err is cudaSuccess; otherwise prints the error
       * text together with its location and exits with EXIT_FAILURE.
       */
      static void HandleError( cudaError_t err,
                               const char *file,
                               int line ) {
          if (err == cudaSuccess) {
              return;
          }
          const char *reason = cudaGetErrorString( err );
          printf( "%s in %s at line %d\n", reason, file, line );
          exit( EXIT_FAILURE );
      }
      /* Checks a CUDA runtime status, reporting the call site on failure. */
      #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

      /*
       * Exits with a diagnostic when a host allocation returned NULL.
       * The argument is parenthesized so any pointer expression is compared
       * as a whole, and the body is wrapped in do { } while (0) so the macro
       * behaves as a single statement (safe inside an unbraced if/else).
       */
      #define HANDLE_NULL( a ) do { if ((a) == NULL) { \
                                  printf( "Host memory failed in %s at line %d\n", \
                                          __FILE__, __LINE__ ); \
                                  exit( EXIT_FAILURE );} } while (0)
      
      /*
       * Prints a message on its own line and hands the given code back.
       *
       * message   - text written to standard output, followed by a newline
       * errorCode - value returned to the caller unchanged
       */
      int errMsg(const char *message, int errorCode)
      {
          const int result = errorCode;
          printf("%s\n", message);
          return result;
      }
      
      /* Pairs a host-side buffer with its device-side counterpart. */
      typedef struct Data Data;
      struct Data {
          double *h;  /* host array (allocated with malloc in main) */
          double *d;  /* device array (allocated with cudaMalloc in main) */
      };
      
      /*
       * Writes 2 into the first 100 elements of the array referenced by d->d.
       *
       * d - pointer to a Data record; it is dereferenced in device code, so
       *     both the record itself and the array behind its `d` member must
       *     be accessible from the device.
       *
       * Each thread handles the element at its global 1D index; threads
       * whose index is 100 or more do nothing.
       */
      __global__ void kernel(Data *d)
      {
          const int idx = threadIdx.x + blockDim.x * blockIdx.x;
          if (idx >= 100) {
              return;
          }
          d->d[idx] = 2;
      }
      
      int main()
      {
          Data *d, *dev_d;
          d = (Data*)malloc(sizeof(Data));
          HANDLE_ERROR(cudaMalloc(&dev_d, sizeof(Data)));
          d->h = (double*)malloc(sizeof(double)*100);
          HANDLE_ERROR( cudaMalloc((void**) &(d->d), 100*sizeof(double)) );
      
          for(int i=0; i<100; i++){
              d->h[i] = i;
          }
      
          HANDLE_ERROR( cudaMemcpy(d->d, d->h, 100*sizeof(double), cudaMemcpyHostToDevice) );
          HANDLE_ERROR(cudaMemcpy(dev_d, d, sizeof(Data), cudaMemcpyHostToDevice));
          printf("%f\n", d->h[1]);
      
          kernel<<<1, 102>>>(dev_d);
      
          printf("done\n");
      
          {
          cudaError_t cudaerr = cudaDeviceSynchronize();
          if (cudaerr != cudaSuccess)
              printf("kernel launch failed with error \"%s\"->\n",
                     cudaGetErrorString(cudaerr));
          }
      
          HANDLE_ERROR( cudaMemcpy(d->h, d->d, 100*sizeof(double), cudaMemcpyDeviceToHost) );
          printf("%f\n", d->h[99]);
      
      
          return 0;
      }
      $ nvcc -arch=sm_35 -o t1311 t1311.cu
      $ cuda-memcheck ./t1311
      ========= CUDA-MEMCHECK
      1.000000
      done
      2.000000
      ========= ERROR SUMMARY: 0 errors
      $
      

      顺便说一下,按照原文链接("here")中概述的方法,你自己就可以把调试过程再推进一步。