Question

好的，让我说我有一个（N x N）矩阵，我想处理。这个矩阵对于我的计算机而言非常大，如果我尝试将其一次性发送到设备，我会收到“内存不足错误”。

那么有没有办法将矩阵的部分发送到设备？我可以看到的一种方法是在主机上复制矩阵的部分，然后将这些可管理的复制部分从主机发送到设备，然后在最后将它们重新组合在一起。

这是我尝试过的，但for循环中的cudaMemcpy返回错误代码11，'无效参数'。

int h_N = 10000;
size_t h_size_m = h_N*sizeof(float);
h_A  = (float*)malloc(h_size_m*h_size_m);

int d_N = 2500;
size_t d_size_m = d_N*sizeof(float);

InitializeMatrices(h_N);

int i;
int iterations = (h_N*h_N)/(d_N*d_N);

for( i = 0; i < iterations; i++ ) 
{
    float* h_array_ref = h_A+(i*d_N*d_N);
    cudasafe( cudaMemcpy(d_A, h_array_ref, d_size_m*d_size_m, cudaMemcpyHostToDevice), "cudaMemcpy");
    cudasafe( cudaFree(d_A), "cudaFree(d_A)" );
}

我想用上面的代码完成的是：不是将整个矩阵发送到设备，而是简单地将指针发送到该矩阵内的某个位置，并在设备上保留足够的空间来完成工作，然后在循环的下一次迭代中将指针向前移动到矩阵等中等。

Answer 1

不仅可以你这样做（假设你的问题很容易以这种方式分解成子数组），它对性能来说是一件非常有用的事情;一旦你得到了你所描述的基本方法，你可以开始使用asynchronous memory copies和双缓冲来重叠一些内存传输时间和计算已经存在卡上的时间。

但是第一个让简单的事情发挥作用。下面是一个示例（将矢量乘以标量并添加另一个标量），但使用线性化的2d数组将是相同的;关键部分是

CHK_CUDA( cudaMalloc(&xd, batchsize*sizeof(float)) );
CHK_CUDA( cudaMalloc(&yd, batchsize*sizeof(float)) );
tick(&gputimer);

int nbatches = 0;
for (int nstart=0; nstart < n; nstart+=batchsize) {

    int size=batchsize;
    if ((nstart + batchsize) > n) size = n - nstart;

    CHK_CUDA( cudaMemcpy(xd, &(x[nstart]), size*sizeof(float), cudaMemcpyHostToDevice) );

    blocksize = (size+nblocks-1)/nblocks;
    cuda_saxpb<<<nblocks, blocksize>>>(xd, a, b, yd, size);

    CHK_CUDA( cudaMemcpy(&(ycuda[nstart]), yd, size*sizeof(float), cudaMemcpyDeviceToHost) );

    nbatches++;
}
gputime = tock(&gputimer);

CHK_CUDA( cudaFree(xd) );
CHK_CUDA( cudaFree(yd) );

您在开始时分配缓冲区，然后循环直到完成，每次执行复制，启动内核，然后复制回来。你最后自由了。

完整的代码是

#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <cuda.h>
#include <sys/time.h>
#include <math.h>

#define CHK_CUDA(e) {if (e != cudaSuccess) {fprintf(stderr,"Error: %s\n", cudaGetErrorString(e)); exit(-1);}}

__global__ void cuda_saxpb(const float *xd, const float a, const float b,
                           float *yd, const int n) {

    int i = threadIdx.x + blockIdx.x*blockDim.x;
    if (i<n) {
        yd[i] = a*xd[i]+b;
    }
    return;
}

void cpu_saxpb(const float *x, float a, float b, float *y, int n) {

    int i;
    for (i=0;i<n;i++) {
        y[i] = a*x[i]+b;
    }
    return;
}

int get_options(int argc, char **argv, int *n, int *s, int *nb, float *a, float *b);
void tick(struct timeval *timer);
double tock(struct timeval *timer);

int main(int argc, char **argv) {
    int n=1000;
    int nblocks=10;
    int batchsize=100;
    float a = 5.;
    float b = -1.;
    int err;
    float *x, *y, *ycuda;
    float *xd, *yd;
    double abserr;
    int blocksize;
    int i;
    struct timeval cputimer;
    struct timeval gputimer;
    double cputime, gputime;

    err = get_options(argc, argv, &n, &batchsize, &nblocks, &a, &b);
    if (batchsize > n) {
        fprintf(stderr, "Resetting batchsize to size of vector, %d\n", n);
        batchsize = n;
    }
    if (err) return 0;

    x = (float *)malloc(n*sizeof(float));
    if (!x) return 1;

    y = (float *)malloc(n*sizeof(float));
    if (!y) {free(x); return 1;}

    ycuda = (float *)malloc(n*sizeof(float));
    if (!ycuda) {free(y); free(x); return 1;}

    /* run CPU code */

    tick(&cputimer);
    cpu_saxpb(x, a, b, y, n);
    cputime = tock(&cputimer);

    /* run GPU code */

    /* only have to allocate once */
    CHK_CUDA( cudaMalloc(&xd, batchsize*sizeof(float)) );
    CHK_CUDA( cudaMalloc(&yd, batchsize*sizeof(float)) );
    tick(&gputimer);

    int nbatches = 0;
    for (int nstart=0; nstart < n; nstart+=batchsize) {

        int size=batchsize;
        if ((nstart + batchsize) > n) size = n - nstart;

        CHK_CUDA( cudaMemcpy(xd, &(x[nstart]), size*sizeof(float), cudaMemcpyHostToDevice) );

        blocksize = (size+nblocks-1)/nblocks;
        cuda_saxpb<<<nblocks, blocksize>>>(xd, a, b, yd, size);

        CHK_CUDA( cudaMemcpy(&(ycuda[nstart]), yd, size*sizeof(float), cudaMemcpyDeviceToHost) );

        nbatches++;
    }
    gputime = tock(&gputimer);

    CHK_CUDA( cudaFree(xd) );
    CHK_CUDA( cudaFree(yd) );

    abserr = 0.;
    for (i=0;i<n;i++) {
        abserr += fabs(ycuda[i] - y[i]);
    }

    printf("Y = a*X + b, problemsize = %d\n", n);
    printf("CPU time = %lg millisec.\n", cputime*1000.);
    printf("GPU time = %lg millisec (done with %d batches of %d).\n",
                  gputime*1000., nbatches, batchsize);
    printf("CUDA and CPU results differ by %lf\n", abserr);

    free(x);
    free(y);
    free(ycuda);
    return 0;
}


int get_options(int argc, char **argv, int *n, int *s, int *nb, float *a, float *b) {

  const struct option long_options[] = {
    {"nvals"     , required_argument, 0, 'n'},
    {"nblocks"   , required_argument, 0, 'B'},
    {"batchsize" , required_argument, 0, 's'},
    {"a", required_argument, 0, 'a'},
    {"b", required_argument, 0, 'b'},
    {"help",      no_argument, 0, 'h'},
    {0, 0, 0, 0}};

  char c;
  int option_index;
  int tempint;

  while (1) {
    c = getopt_long(argc, argv, "n:B:a:b:s:h", long_options, &option_index);
    if (c == -1) break;

    switch(c) {
      case 'n': tempint = atoi(optarg);
          if (tempint < 1 || tempint > 500000) {
            fprintf(stderr,"%s: Cannot use number of points %s;\n  Using %d\n", argv[0], optarg, *n);
          } else {
            *n = tempint;
          }
          break;

      case 's': tempint = atoi(optarg);
          if (tempint < 1 || tempint > 50000) {
            fprintf(stderr,"%s: Cannot use number of points %s;\n  Using %d\n", argv[0], optarg, *s);
          } else {
            *s = tempint;
          }
          break;

      case 'B': tempint = atoi(optarg);
          if (tempint < 1 || tempint > 1000 || tempint > *n) {
            fprintf(stderr,"%s: Cannot use number of blocks %s;\n  Using %d\n", argv[0], optarg, *nb);
          } else {
            *nb = tempint;
          }
          break;

      case 'a': *a = atof(optarg);
          break;

      case 'b': *b = atof(optarg);
          break;

      case 'h':
          puts("Calculates y[i] = a*x[i] + b on the GPU.");
          puts("Options: ");
          puts("    --nvals=N      (-n N): Set the number of values in y,x.");
          puts("    --batchsize=N  (-s N): Set the number of values to transfer at a time.");
          puts("    --nblocks=N    (-B N): Set the number of blocks used.");
          puts("    --a=X          (-a X): Set the parameter a.");
          puts("    --b=X          (-b X): Set the parameter b.");
          puts("    --niters=N     (-I X): Set number of iterations to calculate.");
          puts("");
          return +1;
        }
    }

    return 0;
}

void tick(struct timeval *timer) {
    gettimeofday(timer, NULL);
}

double tock(struct timeval *timer) {
    struct timeval now;
    gettimeofday(&now, NULL);
    return (now.tv_usec-timer->tv_usec)/1.0e6 + (now.tv_sec - timer->tv_sec);
}

运行这个得到：

$  ./batched-saxpb --nvals=10240 --batchsize=10240 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.072 millisec.
GPU time = 0.117 millisec (done with 1 batches of 10240).
CUDA and CPU results differ by 0.000000

$ ./batched-saxpb --nvals=10240 --batchsize=5120 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.066 millisec.
GPU time = 0.133 millisec (done with 2 batches of 5120).
CUDA and CPU results differ by 0.000000

$ ./batched-saxpb --nvals=10240 --batchsize=2560 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.067 millisec.
GPU time = 0.167 millisec (done with 4 batches of 2560).
CUDA and CPU results differ by 0.000000

在这种情况下GPU时间会增加（我们正在做更多的内存副本），但答案保持不变。

编辑：此代码的原始版本有一个选项，用于运行内核的多次迭代以用于计时目的，但这在此上下文中不必要地混淆，因此它被删除。

零碎处理矩阵 - CUDA

1 个答案: