Question

我有两个程序，它们唯一的区别是：一个使用常量内存存储输入数据，另一个使用全局内存。我想知道为什么全局内存版本反而比常量内存版本更快？两个程序都计算两个矩阵（实际上是一维数组）之间的点积。

#include<cuda_runtime.h>
#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
// Minimum of two integer expressions. Every operand is parenthesized so that
// arguments containing lower-precedence operators (e.g. `x|y`) expand
// correctly. Note: each argument is still evaluated twice, so avoid passing
// expressions with side effects.
#define intMin(a,b) (((a) < (b)) ? (a) : (b))
//Threads per block
#define TPB 128
//blocks per grid: enough blocks to cover n elements, capped at 128
#define BPG intMin(128, ((n+TPB-1)/TPB))

// Number of elements in each input array.
const int n = 4;
// Input arrays held in constant memory; the host fills them with
// cudaMemcpyToSymbol before the kernel is launched.
__constant__ float deva[n],devb[n];
// Computes per-block partial sums of the dot product of the constant-memory
// arrays deva and devb (n elements each).
//
// Parameters:
//   c - device array with at least gridDim.x elements; c[blockIdx.x]
//       receives the partial sum produced by that block. The host adds the
//       partial sums together to obtain the final dot product.
//
// Launch requirements:
//   - 1D grid and 1D block.
//   - blockDim.x must not exceed TPB (size of the shared scratch array).
//   - blockDim.x must be a power of two, because the reduction repeatedly
//     halves the number of active threads.
__global__ void addVal( float *c){
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Shared scratch space holding one running sum per thread of the block.
    __shared__ float cache[TPB];

    // Each thread accumulates the products of the elements it owns, stepping
    // by the total number of launched threads.
    float temp = 0;
    while(tid < n){
        temp += deva[tid] * devb[tid];
        tid += gridDim.x * blockDim.x;
    }
    cache[threadIdx.x] = temp;

    // All per-thread sums must be visible before the reduction starts.
    __syncthreads();

    // Tree reduction in shared memory: on every pass the lower half of the
    // active range absorbs the upper half.
    int i = blockDim.x/2;
    while(i != 0){
        if(threadIdx.x < i){
            cache[threadIdx.x] += cache[threadIdx.x + i];
        }
        // Barrier is outside the branch so every thread of the block reaches it.
        __syncthreads();
        i = i/2;
    }

    // Thread 0 always exists, even in a single-thread block, so it is the one
    // that publishes the block's result.
    if(threadIdx.x == 0){
        c[blockIdx.x] = cache[0];
    }
}



// Prints a diagnostic and terminates the program when a CUDA runtime call
// did not return cudaSuccess. `what` names the failing operation.
static void checkCuda(cudaError_t status, const char *what){
    if(status != cudaSuccess){
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver for the constant-memory version: fills the inputs, copies them
// into the constant-memory arrays deva/devb, times one launch of addVal with
// CUDA events, sums the per-block partial results on the host and prints the
// elapsed time followed by the dot product.
int main(){
    float a[n], b[n], c[BPG];
    float *devc = NULL;
    int i;

    //Filling with deterministic values to test
    for(i = 0; i < n; i++){
        a[i] = i;
        b[i] = i*2;
    }

    // Only the output needs a global-memory allocation; the inputs live in
    // constant memory.
    checkCuda(cudaMalloc((void**)&devc, BPG * sizeof(float)), "cudaMalloc(devc)");
    checkCuda(cudaMemcpyToSymbol(deva, a, n * sizeof(float)), "cudaMemcpyToSymbol(deva)");
    checkCuda(cudaMemcpyToSymbol(devb, b, n * sizeof(float)), "cudaMemcpyToSymbol(devb)");

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");

    //Call function to do dot product
    addVal<<<BPG, TPB>>>(devc);
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    // Launch-configuration errors are only reported through cudaGetLastError.
    // It is queried after the stop event is recorded so that the timed region
    // stays as tight as in the original measurement.
    checkCuda(cudaGetLastError(), "addVal launch");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    float elapsedMs = 0;
    checkCuda(cudaEventElapsedTime(&elapsedMs, start, stop), "cudaEventElapsedTime");
    printf("The elapsed time is: %f\n", elapsedMs);

    //copy result back
    checkCuda(cudaMemcpy(c, devc, BPG * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(c)");

    // Final reduction of the per-block partial sums on the host.
    float sum = 0;
    for(i = 0; i < BPG; i++){
        sum += c[i];
    }
    //display answer
    printf("%f\n",sum);

    // Release the events and the device buffer.
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    checkCuda(cudaFree(devc), "cudaFree(devc)");

    // Wait for a key press before exiting.
    getchar();

    return 0;
}

以下是全局内存版本。

#include<cuda_runtime.h>
#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
// Minimum of two integer expressions. Every operand is parenthesized so that
// arguments containing lower-precedence operators (e.g. `x|y`) expand
// correctly. Note: each argument is still evaluated twice, so avoid passing
// expressions with side effects.
#define intMin(a,b) (((a) < (b)) ? (a) : (b))
//Threads per block
#define TPB 128
//blocks per grid: enough blocks to cover n elements, capped at 128
#define BPG intMin(128, ((n+TPB-1)/TPB))

// Number of elements in each input array.
const int n = 4;

// Computes per-block partial sums of the dot product of a and b (n elements
// each).
//
// Parameters:
//   a, b - device arrays holding the n input elements.
//   c    - device array with at least gridDim.x elements; c[blockIdx.x]
//          receives the partial sum produced by that block. The host adds
//          the partial sums together to obtain the final dot product.
//
// Launch requirements:
//   - 1D grid and 1D block.
//   - blockDim.x must not exceed TPB (size of the shared scratch array).
//   - blockDim.x must be a power of two, because the reduction repeatedly
//     halves the number of active threads.
__global__ void addVal(float *a, float *b, float *c){
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Shared scratch space holding one running sum per thread of the block.
    __shared__ float cache[TPB];

    // Each thread accumulates the products of the elements it owns, stepping
    // by the total number of launched threads.
    float temp = 0;
    while(tid < n){
        temp += a[tid] * b[tid];
        tid += gridDim.x * blockDim.x;
    }
    cache[threadIdx.x] = temp;

    // All per-thread sums must be visible before the reduction starts.
    __syncthreads();

    // Tree reduction in shared memory: on every pass the lower half of the
    // active range absorbs the upper half.
    int i = blockDim.x/2;
    while(i != 0){
        if(threadIdx.x < i){
            cache[threadIdx.x] += cache[threadIdx.x + i];
        }
        // Barrier is outside the branch so every thread of the block reaches it.
        __syncthreads();
        i = i/2;
    }

    // Thread 0 always exists, even in a single-thread block, so it is the one
    // that publishes the block's result.
    if(threadIdx.x == 0){
        c[blockIdx.x] = cache[0];
    }
}

// Prints a diagnostic and terminates the program when a CUDA runtime call
// did not return cudaSuccess. `what` names the failing operation.
static void checkCuda(cudaError_t status, const char *what){
    if(status != cudaSuccess){
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver for the global-memory version: fills the inputs, copies them
// into device buffers, times one launch of addVal with CUDA events, sums the
// per-block partial results on the host and prints the elapsed time followed
// by the dot product.
int main(){
    float a[n], b[n], c[BPG];
    float *deva = NULL, *devb = NULL, *devc = NULL;
    int i;

    //Filling with deterministic values to test
    for(i = 0; i < n; i++){
        a[i] = i;
        b[i] = i*2;
    }
    printf("Not using constant memory\n");

    checkCuda(cudaMalloc((void**)&deva, n * sizeof(float)), "cudaMalloc(deva)");
    checkCuda(cudaMalloc((void**)&devb, n * sizeof(float)), "cudaMalloc(devb)");
    checkCuda(cudaMalloc((void**)&devc, BPG * sizeof(float)), "cudaMalloc(devc)");
    checkCuda(cudaMemcpy(deva, a, n * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(deva)");
    checkCuda(cudaMemcpy(devb, b, n * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(devb)");

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");

    //Call function to do dot product
    addVal<<<BPG, TPB>>>(deva, devb, devc);
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    // Launch-configuration errors are only reported through cudaGetLastError.
    // It is queried after the stop event is recorded so that the timed region
    // stays as tight as in the original measurement.
    checkCuda(cudaGetLastError(), "addVal launch");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    float elapsedMs = 0;
    checkCuda(cudaEventElapsedTime(&elapsedMs, start, stop), "cudaEventElapsedTime");
    printf("The elapsed time is: %f\n", elapsedMs);

    //copy result back
    checkCuda(cudaMemcpy(c, devc, BPG * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(c)");

    // Final reduction of the per-block partial sums on the host.
    float sum = 0;
    for(i = 0; i < BPG; i++){
        sum += c[i];
    }
    //display answer
    printf("%f\n",sum);

    // Release the events and the device buffers.
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    checkCuda(cudaFree(deva), "cudaFree(deva)");
    checkCuda(cudaFree(devb), "cudaFree(devb)");
    checkCuda(cudaFree(devc), "cudaFree(devc)");

    // Wait for a key press before exiting.
    getchar();

    return 0;
}

Answer 1

你没有利用到常量内存的优势。

对常量内存的一次读取可以广播给半个warp（half-warp）中的所有线程（你的情况并非如此，因为每个线程按各自的tid读取不同的地址）。
常量内存带有缓存（在你的情况下没有用上，因为常量内存数组中的每个位置只被读取一次）。

当半个warp中的每个线程各自读取不同的数据时，这16次不同的读取会被串行化，发出请求所需的时间是原来的16倍。

如果这些线程是从全局内存中读取，这些请求会被合并（coalesced）并同时完成。这就是为什么你的全局内存版本比常量内存版本更快。

当然，在具有L1和L2缓存的计算能力2.x的设备上，这个结论可能会有所不同。

问候！

使用常量内存和全局内存的程序之间的差异

1 个答案: