CUDA: which is faster? Memory coalescing vs. cache?

Time: 2017-12-09 21:25:33

Tags: caching cuda

I came across an exercise that asks which of the two code snippets below is faster.

First code:

int sum = 0;
for(int i = 0; i < n; i++) {
    sum += array[i*n + thread_id];
}

Second code:

int sum = 0;
for(int i = 0; i < n; i++) {
    sum += array[thread_id*n + i];
}

I would try the code myself, but I won't have access to an Nvidia GPU for the next few days. I think the first code takes advantage of memory coalescing (see here), while the second takes advantage of the cache.
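
To make the difference concrete, the following sketch (my own annotation, not part of the original question) shows which addresses the 32 threads of one warp touch on a single loop iteration under each access pattern:

// First code: thread t reads array[i*n + t]. On iteration i, a warp accesses
//   array[i*n + 0], array[i*n + 1], ..., array[i*n + 31]
// -- 32 consecutive words, which the hardware coalesces into one
// (or very few) memory transactions.
//
// Second code: thread t reads array[t*n + i]. On iteration i, the warp accesses
//   array[0*n + i], array[1*n + i], ..., array[31*n + i]
// -- addresses n words apart, so up to 32 separate transactions;
// only cache hits on later iterations soften the cost.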

1 Answer:

Answer 0 (score: 1)

Many thanks to @RobertCrovella for clarifying the questions about memory coalescing. Here is my attempt to benchmark the two codes, as requested. The output (run on an NVS5400M laptop GPU) clearly shows that the first code is about twice as fast as the second. This is because memory coalescing takes place in the first one (kernel1).

#include <cuda.h>
#include <ctime>
#include <iostream>
#include <stdio.h>
using namespace std;

#define BLOCK_SIZE 1024
#define GRID_SIZE 1024

// Error Handling
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

// kernel1: on each loop iteration, consecutive threads read consecutive
// addresses (array[i*n + 0], array[i*n + 1], ...), so the loads of each
// warp coalesce into a small number of memory transactions.

__global__ void kernel1(int *array, long *sum, int n) {
    long result=0;

    int thread_id=threadIdx.x+blockIdx.x*blockDim.x;

    for(int i=0;i<n;i++) {
        result += array[i*n + thread_id];
    }
    sum[thread_id]=result;
}

// kernel2: each thread walks n consecutive elements of its own row, so
// within a warp the addresses are n words apart and the loads cannot
// coalesce; performance depends on the cache.
__global__ void kernel2(int *array, long *sum, int n) {
    long result=0;

    int thread_id=threadIdx.x+blockIdx.x*blockDim.x;

    for(int i=0;i<n;i++) {
        result += array[n*thread_id+i];
    }
    // no synchronization is needed here; each thread works independently
    sum[thread_id]=result;
}


int main() {
    srand((unsigned)time(0));

    long *h_sum1,*d_sum1;
    long *h_sum2,*d_sum2;
    int n=10;                            // elements summed per thread
    int size1=n*BLOCK_SIZE*GRID_SIZE+n;  // n values per thread, plus padding
    int *h_array;

    h_array=new int[size1];
    h_sum1=new long[size1];
    h_sum2=new long[size1];

    //random number range
    int min =1, max =10;
    for(int i=0;i<size1;i++) {
        h_array[i]= min + (rand() % static_cast<int>(max - min + 1));
        h_sum1[i]=0;
        h_sum2[i]=0;
    }

    int *d_array;
    gpuErrchk(cudaMalloc((void**)&d_array,size1*sizeof(int)));
    gpuErrchk(cudaMalloc((void**)&d_sum1,size1*sizeof(long)));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    gpuErrchk(cudaMemcpy(d_array,h_array,size1*sizeof(int),cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_sum1,h_sum1,size1*sizeof(long),cudaMemcpyHostToDevice));



    cudaEventRecord(start);
    kernel1<<<GRID_SIZE,BLOCK_SIZE>>>(d_array,d_sum1,n);
    cudaEventRecord(stop);

    gpuErrchk(cudaMemcpy(h_sum1,d_sum1,size1*sizeof(long),cudaMemcpyDeviceToHost));

    float milliSeconds1=0;
    gpuErrchk(cudaEventSynchronize(stop)); // make sure the stop event has completed
    cudaEventElapsedTime(&milliSeconds1,start,stop);

    gpuErrchk(cudaMalloc((void**)&d_sum2,size1*sizeof(long)));
    gpuErrchk(cudaMemcpy(d_sum2,h_sum2,size1*sizeof(long),cudaMemcpyHostToDevice));

    cudaEventRecord(start);
    kernel2<<<GRID_SIZE,BLOCK_SIZE>>>(d_array,d_sum2,n);
    cudaEventRecord(stop);

    gpuErrchk(cudaMemcpy(h_sum2,d_sum2,size1*sizeof(long),cudaMemcpyDeviceToHost));


    float milliSeconds2=0;
    gpuErrchk(cudaEventSynchronize(stop)); // make sure the stop event has completed
    cudaEventElapsedTime(&milliSeconds2,start,stop);

    long result_device1=0,result_host1=0;
    long result_device2=0,result_host2=0;
    for(int i=0;i<size1;i++) {
        result_device1 += h_sum1[i];
        result_device2 += h_sum2[i];
    }


    // recompute both sums on the host to verify the device results
    for(int thread_id=0;thread_id<GRID_SIZE*BLOCK_SIZE;thread_id++) {
        for(int i=0;i<n;i++) {
            result_host1 += h_array[i*n+thread_id];
            result_host2 += h_array[n*thread_id+i];
        }
    }

    cout << "Device result1 = " <<  result_device1 << endl;
    cout << "Host result1 = " <<  result_host1 << endl;
    cout << "Time1 (ms) = " << milliSeconds1 << endl;

    cout << "Device result2 = " <<  result_device2 << endl;
    cout << "Host result2 = " <<  result_host2 << endl;
    cout << "Time2 (ms) = " << milliSeconds2 << endl;

    gpuErrchk(cudaFree(d_array));
    gpuErrchk(cudaFree(d_sum1));
    gpuErrchk(cudaFree(d_sum2));
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    delete[] h_array;
    delete[] h_sum1;
    delete[] h_sum2;

    return 0;
}

The CUDA event timer output is as follows:

Device result1 = 57659226
Host result1 = 57659226
Time1 (ms) = 5.21952
Device result2 = 57674257
Host result2 = 57674257
Time2 (ms) = 11.8356
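
As a rough sanity check (my own addition, using the figures above), the measured times can be turned into effective bandwidth. Each kernel reads GRID_SIZE*BLOCK_SIZE*n ints and writes GRID_SIZE*BLOCK_SIZE longs, so a snippet like the following could be appended to main():

    // Back-of-the-envelope effective bandwidth in GB/s, assuming every
    // global load/store goes to DRAM exactly once (caching makes the
    // real traffic of kernel2 somewhat lower).
    double bytes = (double)GRID_SIZE * BLOCK_SIZE * (n * sizeof(int) + sizeof(long));
    cout << "BW1 (GB/s) = " << bytes / (milliSeconds1 * 1e6) << endl;
    cout << "BW2 (GB/s) = " << bytes / (milliSeconds2 * 1e6) << endl;

With the numbers above this works out to roughly 9.6 GB/s for kernel1 and 4.3 GB/s for kernel2.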