在内核内部,直接赋值复制有效,但 cudaMemcpyAsync 和 memcpy 复制失败

时间:2019-09-10 07:42:21

标签: cuda memcpy

我正在尝试从cuda内核中的源float数组(包含1.0f)复制到目标float数组(包含2.0f)。我尝试使用三种不同的方式:

  • cudaMemcpyAsync
  • memcpy
  • 直接复制(dst [i] = src [i])

当我在执行内核后读取结果时,我发现直接复制方法可以成功复制,而 cudaMemcpyAsync 和 memcpy 均无法完成复制。

为什么cudamemcpyasync和memcpy方法失败?

我正在使用GTX TitanX(SM_52)。

使用以下命令编译:nvcc -arch=compute_52 main.cu

main.cu:

#include <stdio.h>
#include <iostream>


// Kernel from the question: each thread whose global index is below `size`
// is meant to copy one float from `src` to `dst`. All three candidate copy
// statements are commented out, so as written the kernel modifies nothing.
//
//   src  - source array of floats (read by the candidate statements)
//   dst  - destination array of floats (written by the candidate statements)
//   size - number of elements; threads with idx >= size skip the copy
//
// Why the first two candidates do not behave like the third:
//   * `dst + idx*sizeof(float)` is arithmetic on a float*, which is already
//     scaled by the element size. It therefore addresses element
//     idx*sizeof(float) instead of element idx, and for most threads that
//     lies past the end of a `size`-element array.
//   * The count argument of memcpy / cudaMemcpyAsync is in bytes; passing 1
//     copies a single byte rather than the sizeof(float) bytes of one element.
//   * NOTE(review): cudaMemcpyAsync called from device code is a device-runtime
//     call and presumably needs relocatable device code (nvcc -rdc=true / -dc)
//     to build at all -- confirm against the toolkit in use.
__global__
void cudamemcpy_inside_kernel(float *src, float *dst, int size)
{
  // Flat 1D global thread index.
  int idx = blockIdx.x*blockDim.x + threadIdx.x;

    // Bounds guard: the launch may create more threads than there are elements.
    if(idx < size){
//        memcpy(dst +idx*sizeof(float), src + idx*sizeof(float), 1); // FAILS TO COPY
//        cudaMemcpyAsync(dst +idx*sizeof(float), src + idx*sizeof(float), 1, cudaMemcpyDeviceToDevice); // FAILS TO COPY
//          dst[idx] = src[idx]; // COPIES SUCCESSFULLY
    }

}

// Running counter backing UniqueNumber(); starts at zero.
int current = 0;

// Advances the global counter by one and returns the updated value,
// so successive calls yield 1, 2, 3, ...
int UniqueNumber()
{
  current += 1;
  return current;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return true;
  fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Fills x with 1.0f and y with 2.0f, uploads both, runs the kernel with
// (d_x -> d_y), downloads d_y into y and prints it.
// Returns 0 on success, 1 as soon as any CUDA call reports an error.
static int runCopyExperiment(float *x, float *y, float *d_x, float *d_y, int N)
{
  const size_t bytes = N*sizeof(float);

  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  if (!cudaSucceeded(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice), "upload of x"))
    return 1;
  if (!cudaSucceeded(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice), "upload of y"))
    return 1;

  // Derive the grid from N (ceil-div) so every element gets a thread;
  // for N = 1000 this is the same <<<2, 512>>> configuration as before.
  const int threads = 512;
  const int blocks = (N + threads - 1)/threads;
  cudamemcpy_inside_kernel<<<blocks, threads>>>(d_x, d_y, N);

  // A launch does not return a status: configuration errors show up in
  // cudaGetLastError(), faults raised while the kernel runs show up at the
  // next synchronizing call.
  if (!cudaSucceeded(cudaGetLastError(), "kernel launch"))
    return 1;
  if (!cudaSucceeded(cudaDeviceSynchronize(), "kernel execution"))
    return 1;

  // Blocking copy; no additional synchronization is needed afterwards.
  if (!cudaSucceeded(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost), "download of y"))
    return 1;

  for (int i = 0; i < N; i++)
     printf(" %f\n", y[i]); // expected to be 1.0f once the kernel really copies

  return 0;
}

// Allocates the host/device buffers, runs the experiment and releases
// everything again. Returns 0 on success and 1 on any failure.
int main(void)
{
  int N = 1000;
  int status = 1;

  float *x, *y, *d_x = NULL, *d_y = NULL;
  x = (float*)malloc(N*sizeof(float));
  y = (float*)malloc(N*sizeof(float));

  if (x == NULL || y == NULL) {
    fprintf(stderr, "host allocation failed\n");
  } else if (cudaSucceeded(cudaMalloc(&d_x, N*sizeof(float)), "cudaMalloc of d_x") &&
             cudaSucceeded(cudaMalloc(&d_y, N*sizeof(float)), "cudaMalloc of d_y")) {
    status = runCopyExperiment(x, y, d_x, d_y, N);
  }

  // Releasing null pointers is harmless, so cleanup runs on every path.
  cudaFree(d_x);
  cudaFree(d_y);
  free(x);
  free(y);

  return status;
}

1 个答案:

答案 0 :(得分:3)

两个 memcpy 调用中的源地址、目标地址和大小参数都是错误的:对 float* 做指针运算时偏移量已经以元素为单位,因此不应再乘以 sizeof(float);而大小参数以字节为单位,应为 sizeof(float) 而不是 1。改成下面这样:

#include <stdio.h>
#include <iostream>

// Copies one float per thread from `src` to `dst`, using the mechanism
// selected at compile time by `action`:
//   1         -> device-side memcpy of sizeof(float) bytes
//   2         -> device-side cudaMemcpyAsync of sizeof(float) bytes
//   otherwise -> plain element assignment
//
//   src  - source array of floats
//   dst  - destination array of floats
//   size - number of elements; threads whose global index is >= size do nothing
//
// Pointer offsets are expressed in elements (dst + index), while the byte
// count passed to the copy routines is sizeof(float).
// NOTE(review): the cudaMemcpyAsync variant is a call into the CUDA runtime
// from device code; presumably this requires building with relocatable
// device code (nvcc -dc / -rdc=true) -- confirm for the toolkit in use.
template<int action>
__global__
void cudamemcpy_inside_kernel(float *src, float *dst, int size)
{
  const int gid = threadIdx.x + blockDim.x*blockIdx.x;

  // Surplus threads in the last block have no element to handle.
  if (gid >= size)
    return;

  float *to = dst + gid;
  float *from = src + gid;

  if (action == 1) {
    memcpy(to, from, sizeof(float));
  } else if (action == 2) {
    cudaMemcpyAsync(to, from, sizeof(float), cudaMemcpyDeviceToDevice);
  } else {
    *to = *from;
  }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return true;
  fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Runs one copy variant of cudamemcpy_inside_kernel and prints the result.
//
//   title - heading printed before the values
//   y     - host scratch buffer of N floats (overwritten)
//   d_x   - device source array, already populated
//   d_y   - device destination array (overwritten)
//   N     - element count
//
// Returns 0 on success, 1 as soon as any CUDA call reports an error.
template<int action>
static int runVariant(const char *title, float *y, float *d_x, float *d_y, int N)
{
  const size_t bytes = N*sizeof(float);

  printf("%s", title);

  // Refill the 2.0f sentinel before every variant. After a previous variant
  // y holds the downloaded 1.0f results; uploading that would make this
  // variant appear to succeed even if its kernel copied nothing.
  for (int i = 0; i < N; i++)
    y[i] = 2.0f;
  if (!checkCuda(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice), "reset of d_y"))
    return 1;

  cudamemcpy_inside_kernel<action><<<(N+255)/256, 256>>>(d_x, d_y, N);

  // A launch does not return a status: configuration errors show up in
  // cudaGetLastError(), faults raised while the kernel runs show up at the
  // next synchronizing call.
  if (!checkCuda(cudaGetLastError(), "kernel launch"))
    return 1;
  if (!checkCuda(cudaDeviceSynchronize(), "kernel execution"))
    return 1;

  if (!checkCuda(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost), "download of d_y"))
    return 1;

  for (int i = 0; i < N; i++)
     printf(" %f\n", y[i]);

  return 0;
}

// Exercises the three copy variants (assignment, memcpy, cudaMemcpyAsync)
// against a source of 1.0f values and a destination preset to 2.0f.
// Returns 0 when every variant ran without a CUDA error, 1 otherwise.
int main(void)
{
  int N = 10;
  int status = 1;

  float *x, *y, *d_x = NULL, *d_y = NULL;
  x = (float*)malloc(N*sizeof(float));
  y = (float*)malloc(N*sizeof(float));

  if (x == NULL || y == NULL) {
    fprintf(stderr, "host allocation failed\n");
  } else if (checkCuda(cudaMalloc(&d_x, N*sizeof(float)), "cudaMalloc of d_x") &&
             checkCuda(cudaMalloc(&d_y, N*sizeof(float)), "cudaMalloc of d_y")) {
    for (int i = 0; i < N; i++)
      x[i] = 1.0f;

    if (checkCuda(cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice), "upload of x")) {
      status = runVariant<0>("Assignment \n", y, d_x, d_y, N);
      if (status == 0)
        status = runVariant<1>("\n Memcpy \n", y, d_x, d_y, N);
      if (status == 0)
        status = runVariant<2>("\n cudaMemcpyAsync \n", y, d_x, d_y, N);
    }
  }

  // Releasing null pointers is harmless, so cleanup runs on every path.
  cudaFree(d_x);
  cudaFree(d_y);
  free(x);
  free(y);

  return status;
}

将按预期工作:

$ nvcc -arch=sm_52 -dc -o memcpy_kernel.o memcpy_kernel.cu
$ nvcc -arch=sm_52 -o memcpy_kernel memcpy_kernel.o
$ ./memcpy_kernel 
Assignment 
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000

 Memcpy 
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000

 cudaMemcpyAsync 
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
相关问题