Is CUDA shared memory not faster than global memory?

Asked: 2019-03-29 09:38:08

Tags: performance memory cuda nvidia

I am trying to use shared memory to speed up a test code. The code contains a while loop that performs some calculations using arrays A and B, then exits and writes the result to array C.

The problem is that with threads_per_block = 1 the shared-memory version is 25% faster, while with threads_per_block = 1024 it is 0% faster.

I expected shared memory to improve the speed. If something is wrong, where is the problem?
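One thing I am not sure about is the measurement itself: the GPU times below are taken with host-side clock() around the kernel launch plus cudaDeviceSynchronize(), so they also include launch and synchronization overhead. A minimal, self-contained sketch of how a kernel could instead be timed with CUDA events (dummy_kernel here is only a placeholder, not one of my kernels) would look roughly like this:

#include <stdio.h>
#include <stdint.h>
#include <cuda_runtime.h>

__global__ void dummy_kernel(float *x, uint32_t n) {
    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] = x[i] * 2.0f + 1.0f;
}

int main() {
    const uint32_t n = 1024 * 1024;
    float *x;
    cudaMallocManaged(&x, n * sizeof(float));
    for (uint32_t i = 0; i < n; i++) x[i] = 1.0f;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);                   // enqueued in the default stream, before the kernel
    dummy_kernel<<<n / 256, 256>>>(x, n);
    cudaEventRecord(stop);                    // enqueued right after the kernel
    cudaEventSynchronize(stop);               // wait until the kernel and the stop event have completed

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);   // elapsed device time in milliseconds
    printf("kernel time: %.3f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(x);
    return 0;
}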

My system specs:

i7-8750H, 16 GB dual-channel DDR4-2666 RAM, GTX 1060 Mobile 6 GB

#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <math.h>

#define threads_per_block 1024
#define blocks_per_grid  1024
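// dynamic shared memory per block, in bytes: 3 float buffers of threads_per_block elements (4 bytes per float)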
#define shared_memory_size threads_per_block * 4 * 3

float clock_t1 = 0, clock_t2 = 0, clock_dt = 0;

void VectorAdd_cpu(float *A, float *B, float *C, int64_t n, int64_t repeat) {
    uint32_t index = 0, j = 0;
    for (index = 0; index < n; index++) {
        j = 0;
        while (j < repeat) {
            j++;
            C[index] = A[index] + B[index];
        }
    }
}

__global__ void VectorAdd_gpu(float *Ac, float *Bc, float *Cc, uint32_t n, uint32_t repeat) {
    uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;
    uint32_t j;
    if (index < n) {
        j = 0;
        while (j < repeat) {
            j++;
            Cc[index] = Ac[index] + Bc[index];
        }
    }
}

__global__ void VectorAdd_gpu_shared(float *Acs, float *Bcs, float *Ccs, uint32_t n, uint32_t repeat) {
    uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;
    uint32_t tx = threadIdx.x;
    uint32_t j;

    float *Acs_temp, *Bcs_temp, *Ccs_temp;
    extern __shared__ float S[];

    // Carve the dynamic shared memory S into three per-block buffers of threads_per_block floats each.
    Acs_temp = (float*)S;
    Bcs_temp = (float*)Acs_temp + threads_per_block;
    Ccs_temp = (float*)Bcs_temp + threads_per_block;

    __syncthreads();

    if (index < n) {

        Acs_temp[tx] = Acs[index];
        Bcs_temp[tx] = Bcs[index];
        Ccs_temp[tx] = Ccs[index];

        j = 0;
        while (j < repeat) {
            j++;
            Ccs_temp[tx] = Acs_temp[tx] + Bcs_temp[tx];
        }
        Acs[index] = Acs_temp[tx];
        Bcs[index] = Bcs_temp[tx];
        Ccs[index] = Ccs_temp[tx];
    }
}

int main() {
    int nDevices;
    cudaError_t err = cudaGetDeviceCount(&nDevices);
    int64_t lenArray = threads_per_block * blocks_per_grid;
    int64_t repeat = 0;
    float *A, *B, *C, *Ac, *Bc, *Cc, *Acs, *Bcs, *Ccs;
    uint64_t Size_array = lenArray * sizeof(float);

    clock_t1 = float(clock());
    A = (float *)malloc(Size_array);
    B = (float *)malloc(Size_array);
    C = (float *)malloc(Size_array);
    clock_t2 = float(clock());printf("Cpu memory allocation : %.f ms\n", clock_t2 - clock_t1);

    clock_t1 = float(clock());
    cudaMallocManaged(&Ac, Size_array);
    cudaMallocManaged(&Bc, Size_array);
    cudaMallocManaged(&Cc, Size_array);
    cudaMallocManaged(&Acs, Size_array);
    cudaMallocManaged(&Bcs, Size_array);
    cudaMallocManaged(&Ccs, Size_array);
    clock_t2 = float(clock());printf("Gpu memory allocation : %.f ms\n", clock_t2 - clock_t1);

    clock_t1 = float(clock());
    for (int64_t i = 0; i < lenArray; i++) {
        A[i] = i;
        B[i] = i;
        C[i] = 0;
    }
    clock_t2 = float(clock());printf("Cpu setting variable : %.f ms\n", clock_t2 - clock_t1);

    clock_t1 = float(clock());printf("Cpu to Gpu copy variable :");
    cudaMemcpy(Ac, A, Size_array, cudaMemcpyHostToDevice);
    cudaMemcpy(Bc, B, Size_array, cudaMemcpyHostToDevice);
    cudaMemcpy(Acs, A, Size_array, cudaMemcpyHostToDevice);
    cudaMemcpy(Bcs, B, Size_array, cudaMemcpyHostToDevice);
    clock_t2 = float(clock());printf("%.f ms\n", clock_t2 - clock_t1);

    float temp1 = 0, temp2 = 0, temp3 = 0;
    for (int i = 0;i <= 100;i++) {
        repeat = pow(2, i);printf("Repeat : 2^%d\t", i);
        clock_t1 = float(clock());printf("Cpu execution : ");
        VectorAdd_cpu(A, B, C, lenArray, repeat);
        clock_t2 = float(clock());printf("%.f\tms ", clock_t2 - clock_t1);
        temp1 = clock_t2 - clock_t1;
        clock_t1 = float(clock());printf("Gpu execution : ");
        VectorAdd_gpu << <blocks_per_grid, threads_per_block >> > (Ac, Bc, Cc, lenArray, repeat);cudaDeviceSynchronize();
        clock_t2 = float(clock());printf("%.f\tms ", clock_t2 - clock_t1);
        temp2 = clock_t2 - clock_t1;
        clock_t1 = float(clock());printf("Gpu_shared execution : ");
        VectorAdd_gpu_shared << <blocks_per_grid, threads_per_block, shared_memory_size >> > (Acs, Bcs, Ccs, lenArray, repeat);cudaDeviceSynchronize();
        clock_t2 = float(clock());printf("%.f\tms ", clock_t2 - clock_t1);
        temp3 = clock_t2 - clock_t1;
        printf("Gpu/Cpu : %.2f\t", temp1 / temp2);
        printf("Gpu_shared/Gpu : %.2f\n", temp2 / temp3);
    }
    free(A);free(B);free(C);
    cudaFree(Ac);cudaFree(Bc);cudaFree(Cc);
    cudaFree(Acs);cudaFree(Bcs);cudaFree(Ccs);

    printf("\n\nDone...\n\n");
    return 1;
}

Results of the code:

Cpu memory allocation : 5 ms
Gpu memory allocation : 150 ms
Cpu setting variable : 2 ms
Cpu to Gpu copy variable :4 ms

Repeat : 2^0    Cpu execution : 9       ms Gpu execution : 5    ms Gpu_shared execution : 9     ms Gpu/Cpu : 1.80      Gpu_shared/Gpu : 0.56
Repeat : 2^1    Cpu execution : 21      ms Gpu execution : 8    ms Gpu_shared execution : 9     ms Gpu/Cpu : 2.63      Gpu_shared/Gpu : 0.89
Repeat : 2^2    Cpu execution : 39      ms Gpu execution : 8    ms Gpu_shared execution : 11    ms Gpu/Cpu : 4.88      Gpu_shared/Gpu : 0.73
Repeat : 2^3    Cpu execution : 77      ms Gpu execution : 3    ms Gpu_shared execution : 4     ms Gpu/Cpu : 25.67     Gpu_shared/Gpu : 0.75
Repeat : 2^4    Cpu execution : 159     ms Gpu execution : 6    ms Gpu_shared execution : 7     ms Gpu/Cpu : 26.50     Gpu_shared/Gpu : 0.86
Repeat : 2^5    Cpu execution : 310     ms Gpu execution : 12   ms Gpu_shared execution : 12    ms Gpu/Cpu : 25.83     Gpu_shared/Gpu : 1.00
Repeat : 2^6    Cpu execution : 620     ms Gpu execution : 22   ms Gpu_shared execution : 23    ms Gpu/Cpu : 28.18     Gpu_shared/Gpu : 0.96
Repeat : 2^7    Cpu execution : 1249    ms Gpu execution : 44   ms Gpu_shared execution : 44    ms Gpu/Cpu : 28.39     Gpu_shared/Gpu : 1.00
Repeat : 2^8    Cpu execution : 2487    ms Gpu execution : 87   ms Gpu_shared execution : 88    ms Gpu/Cpu : 28.59     Gpu_shared/Gpu : 0.99
Repeat : 2^9    Cpu execution : 4971    ms Gpu execution : 174  ms Gpu_shared execution : 163   ms Gpu/Cpu : 28.57     Gpu_shared/Gpu : 1.07
Repeat : 2^10   Cpu execution : 9935    ms Gpu execution : 271  ms Gpu_shared execution : 266   ms Gpu/Cpu : 36.66     Gpu_shared/Gpu : 1.02

Edit: I have disabled the CUDA optimizations in the compiler options, and the results are surprisingly fast. I have also changed

C[index] = A[index] + B[index];

to

C[index] += A[index] + B[index];

because otherwise the shared-memory kernel finishes in 0-1 ms even with repeat = 2^1000, which I think is a separate problem. I have checked the contents of the C[] array for both versions ("=" and "+="): the CPU, GPU and GPU_shared results are identical. I believe this is another compiler issue: the loop is not actually computed "repeat" times. With "+=" I can be sure the program really computes it "repeat" times, because each iteration then depends on the previous value. A short sketch of this effect and the new code follow.
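To show what I mean by the loop not really being computed "repeat" times, here is a minimal, self-contained sketch (a separate hypothetical file, not my test code; the kernel names overwrite_loop and accumulate_loop are made up). In the first kernel the loop body overwrites the same value on every iteration, so the optimizer is free to collapse the whole loop into a single store and the time may not depend on repeat at all; in the second kernel every iteration depends on the previous value of C[i], so the additions cannot be removed:

#include <stdio.h>
#include <stdint.h>
#include <cuda_runtime.h>

// Idempotent loop body: C[i] gets the same value in every iteration, so the
// compiler may reduce the loop to a single store.
__global__ void overwrite_loop(const float *A, const float *B, float *C, uint32_t n, uint32_t repeat) {
    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        for (uint32_t j = 0; j < repeat; j++)
            C[i] = A[i] + B[i];              // result does not depend on j
}

// Loop-carried dependency: each iteration reads the value written by the
// previous one, so the work really scales with repeat.
__global__ void accumulate_loop(const float *A, const float *B, float *C, uint32_t n, uint32_t repeat) {
    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        for (uint32_t j = 0; j < repeat; j++)
            C[i] += A[i] + B[i];             // depends on the previous C[i]
}

int main() {
    const uint32_t n = 1 << 20, repeat = 1 << 12;
    float *A, *B, *C;
    cudaMallocManaged(&A, n * sizeof(float));
    cudaMallocManaged(&B, n * sizeof(float));
    cudaMallocManaged(&C, n * sizeof(float));
    for (uint32_t i = 0; i < n; i++) { A[i] = 1.0f; B[i] = 2.0f; C[i] = 0.0f; }

    overwrite_loop<<<n / 256, 256>>>(A, B, C, n, repeat);
    cudaDeviceSynchronize();
    printf("overwrite : C[0] = %.0f\n", C[0]);    // 3, no matter how large repeat is

    for (uint32_t i = 0; i < n; i++) C[i] = 0.0f;
    accumulate_loop<<<n / 256, 256>>>(A, B, C, n, repeat);
    cudaDeviceSynchronize();
    printf("accumulate: C[0] = %.0f\n", C[0]);    // 3 * repeat = 12288

    cudaFree(A); cudaFree(B); cudaFree(C);
    return 0;
}

The new test code: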

#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <math.h>

#define threads_per_block 1024
#define blocks_per_grid  1024
#define shared_memory_size threads_per_block * 4 * 3

float clock_t1 = 0, clock_t2 = 0, clock_dt = 0;

void VectorAdd_cpu(float *A, float *B, float *C, int64_t n, int64_t repeat) {
    uint32_t index = 0, j = 0;
    for (index = 0; index < n; index++) {
        j = 0;
        while (j < repeat) {
            j++;
            C[index] += A[index] + B[index];
        }
    }
}

__global__ void VectorAdd_gpu(float *Ac, float *Bc, float *Cc, uint32_t n, uint32_t repeat) {
    uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;
    uint32_t j;
    if (index < n) {
        j = 0;
        while (j < repeat) {
            j++;
            Cc[index] += Ac[index] + Bc[index];
        }
    }
}

__global__ void VectorAdd_gpu_shared(float *Acs, float *Bcs, float *Ccs, uint32_t n, uint32_t repeat) {
    uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;
    uint32_t tx = threadIdx.x;
    uint32_t j;

    float *Acs_temp, *Bcs_temp, *Ccs_temp;
    extern __shared__ float S[];

    Acs_temp = (float*)S;
    Bcs_temp = (float*)Acs_temp + threads_per_block;
    Ccs_temp = (float*)Bcs_temp + threads_per_block;

    __syncthreads();

    if (index < n) {

        Acs_temp[tx] = Acs[index];
        Bcs_temp[tx] = Bcs[index];
        Ccs_temp[tx] = Ccs[index];

        j = 0;
        while (j < repeat) {
            j++;
            Ccs_temp[tx] += Acs_temp[tx] + Bcs_temp[tx];
        }
        Acs[index] = Acs_temp[tx];
        Bcs[index] = Bcs_temp[tx];
        Ccs[index] = Ccs_temp[tx];
    }
}

int main() {
    int64_t lenArray = threads_per_block * blocks_per_grid;
    int64_t repeat = 0;
    float *A, *B, *C, *Ac, *Bc, *Cc, *Acs, *Bcs, *Ccs;
    uint64_t Size_array = lenArray * sizeof(float);

    clock_t1 = float(clock());
    A = (float *)malloc(Size_array);
    B = (float *)malloc(Size_array);
    C = (float *)malloc(Size_array);
    clock_t2 = float(clock());printf("Cpu memory allocation : %.f ms\n", clock_t2 - clock_t1);

    clock_t1 = float(clock());
    cudaMallocManaged(&Ac, Size_array);
    cudaMallocManaged(&Bc, Size_array);
    cudaMallocManaged(&Cc, Size_array);
    cudaMallocManaged(&Acs, Size_array);
    cudaMallocManaged(&Bcs, Size_array);
    cudaMallocManaged(&Ccs, Size_array);
    clock_t2 = float(clock());printf("Gpu memory allocation : %.f ms\n", clock_t2 - clock_t1);

    clock_t1 = float(clock());
    for (int64_t i = 0; i < lenArray; i++) {
        A[i] = i;
        B[i] = i;
        C[i] = 0;
    }
    clock_t2 = float(clock());printf("Cpu setting variable : %.f ms\n", clock_t2 - clock_t1);

    clock_t1 = float(clock());printf("Cpu to Gpu copy variable :");
    cudaMemcpy(Ac, A, Size_array, cudaMemcpyHostToDevice);
    cudaMemcpy(Bc, B, Size_array, cudaMemcpyHostToDevice);
    cudaMemcpy(Acs, A, Size_array, cudaMemcpyHostToDevice);
    cudaMemcpy(Bcs, B, Size_array, cudaMemcpyHostToDevice);
    clock_t2 = float(clock());printf("%.f ms\n", clock_t2 - clock_t1);

    float temp1 = 0, temp2 = 0, temp3 = 0;
    for (int i = 0;i <= 12;i++) {
        repeat = pow(2, i);printf("Repeat : 2^%d\t", i);
        clock_t1 = float(clock());printf("Cpu execution : ");
        VectorAdd_cpu(A, B, C, lenArray, repeat);
        clock_t2 = float(clock());printf("%.f\tms ", clock_t2 - clock_t1);
        temp1 = clock_t2 - clock_t1;
        clock_t1 = float(clock());printf("Gpu execution : ");
        VectorAdd_gpu << <blocks_per_grid, threads_per_block >> > (Ac, Bc, Cc, lenArray, repeat);cudaDeviceSynchronize();
        clock_t2 = float(clock());printf("%.f\tms ", clock_t2 - clock_t1);
        temp2 = clock_t2 - clock_t1;
        clock_t1 = float(clock());printf("Gpu_shared execution : ");
        VectorAdd_gpu_shared << <blocks_per_grid, threads_per_block, shared_memory_size >> > (Acs, Bcs, Ccs, lenArray, repeat);cudaDeviceSynchronize();
        clock_t2 = float(clock());printf("%.f\tms ", clock_t2 - clock_t1);
        temp3 = clock_t2 - clock_t1;
        printf("Gpu/Cpu : %.2f\t", temp1 / temp2);
        printf("Gpu_shared/Gpu : %.2f\n", temp2 / temp3);
    }
    free(A);free(B);free(C);
    cudaFree(Ac);cudaFree(Bc);cudaFree(Cc);
    cudaFree(Acs);cudaFree(Bcs);cudaFree(Ccs);

    printf("\n\nDone...\n\n");
    return 1;
}

Results with the CPU:

Cpu memory allocation : 0 ms
Gpu memory allocation : 154 ms
Cpu setting variable : 5 ms
Cpu to Gpu copy variable :3 ms

Repeat : 2^0    Cpu execution : 3       ms Gpu execution : 1    ms Gpu_shared execution : 1     ms Gpu/Cpu : 3.00      Gpu_shared/Gpu : 1.00
Repeat : 2^1    Cpu execution : 6       ms Gpu execution : 1    ms Gpu_shared execution : 1     ms Gpu/Cpu : 6.00      Gpu_shared/Gpu : 1.00
Repeat : 2^2    Cpu execution : 11      ms Gpu execution : 1    ms Gpu_shared execution : 1     ms Gpu/Cpu : 11.00     Gpu_shared/Gpu : 1.00
Repeat : 2^3    Cpu execution : 22      ms Gpu execution : 2    ms Gpu_shared execution : 1     ms Gpu/Cpu : 11.00     Gpu_shared/Gpu : 2.00
Repeat : 2^4    Cpu execution : 45      ms Gpu execution : 3    ms Gpu_shared execution : 1     ms Gpu/Cpu : 15.00     Gpu_shared/Gpu : 3.00
Repeat : 2^5    Cpu execution : 89      ms Gpu execution : 2    ms Gpu_shared execution : 1     ms Gpu/Cpu : 44.50     Gpu_shared/Gpu : 2.00
Repeat : 2^6    Cpu execution : 178     ms Gpu execution : 2    ms Gpu_shared execution : 0     ms Gpu/Cpu : 89.00     Gpu_shared/Gpu : inf
Repeat : 2^7    Cpu execution : 357     ms Gpu execution : 2    ms Gpu_shared execution : 1     ms Gpu/Cpu : 178.50    Gpu_shared/Gpu : 2.00
Repeat : 2^8    Cpu execution : 718     ms Gpu execution : 5    ms Gpu_shared execution : 0     ms Gpu/Cpu : 143.60    Gpu_shared/Gpu : inf
Repeat : 2^9    Cpu execution : 1400    ms Gpu execution : 9    ms Gpu_shared execution : 1     ms Gpu/Cpu : 155.56    Gpu_shared/Gpu : 9.00
Repeat : 2^10   Cpu execution : 2790    ms Gpu execution : 17   ms Gpu_shared execution : 1     ms Gpu/Cpu : 164.12    Gpu_shared/Gpu : 17.00
Repeat : 2^11   Cpu execution : 5566    ms Gpu execution : 34   ms Gpu_shared execution : 2     ms Gpu/Cpu : 163.71    Gpu_shared/Gpu : 17.00
Repeat : 2^12   Cpu execution : 11124   ms Gpu execution : 67   ms Gpu_shared execution : 4     ms Gpu/Cpu : 166.03    Gpu_shared/Gpu : 16.75


Done...

I also ran the test without the CPU part, because it is too slow to see the differences clearly.

//VectorAdd_cpu(A, B, C, lenArray, repeat);

Cpu memory allocation : 0 ms
Gpu memory allocation : 199 ms
Cpu setting variable : 6 ms
Cpu to Gpu copy variable :3 ms

Repeat : 2^0    Cpu execution : 0       ms Gpu execution : 0    ms Gpu_shared execution : 0     ms Gpu/Cpu : -nan(ind)  Gpu_shared/Gpu : -nan(ind)
Repeat : 2^1    Cpu execution : 0       ms Gpu execution : 1    ms Gpu_shared execution : 0     ms Gpu/Cpu : 0.00       Gpu_shared/Gpu : inf
Repeat : 2^2    Cpu execution : 0       ms Gpu execution : 1    ms Gpu_shared execution : 0     ms Gpu/Cpu : 0.00       Gpu_shared/Gpu : inf
Repeat : 2^3    Cpu execution : 0       ms Gpu execution : 1    ms Gpu_shared execution : 0     ms Gpu/Cpu : 0.00       Gpu_shared/Gpu : inf
Repeat : 2^4    Cpu execution : 0       ms Gpu execution : 1    ms Gpu_shared execution : 0     ms Gpu/Cpu : 0.00       Gpu_shared/Gpu : inf
Repeat : 2^5    Cpu execution : 0       ms Gpu execution : 1    ms Gpu_shared execution : 0     ms Gpu/Cpu : 0.00       Gpu_shared/Gpu : inf
Repeat : 2^6    Cpu execution : 0       ms Gpu execution : 2    ms Gpu_shared execution : 0     ms Gpu/Cpu : 0.00       Gpu_shared/Gpu : inf
Repeat : 2^7    Cpu execution : 0       ms Gpu execution : 3    ms Gpu_shared execution : 0     ms Gpu/Cpu : 0.00       Gpu_shared/Gpu : inf
Repeat : 2^8    Cpu execution : 1       ms Gpu execution : 4    ms Gpu_shared execution : 1     ms Gpu/Cpu : 0.25       Gpu_shared/Gpu : 4.00
Repeat : 2^9    Cpu execution : 0       ms Gpu execution : 9    ms Gpu_shared execution : 1     ms Gpu/Cpu : 0.00       Gpu_shared/Gpu : 9.00
Repeat : 2^10   Cpu execution : 0       ms Gpu execution : 18   ms Gpu_shared execution : 1     ms Gpu/Cpu : 0.00       Gpu_shared/Gpu : 18.00
Repeat : 2^11   Cpu execution : 0       ms Gpu execution : 35   ms Gpu_shared execution : 2     ms Gpu/Cpu : 0.00       Gpu_shared/Gpu : 17.50
Repeat : 2^12   Cpu execution : 0       ms Gpu execution : 73   ms Gpu_shared execution : 4     ms Gpu/Cpu : 0.00       Gpu_shared/Gpu : 18.25
Repeat : 2^13   Cpu execution : 1       ms Gpu execution : 133  ms Gpu_shared execution : 6     ms Gpu/Cpu : 0.01       Gpu_shared/Gpu : 22.17
Repeat : 2^14   Cpu execution : 0       ms Gpu execution : 246  ms Gpu_shared execution : 11    ms Gpu/Cpu : 0.00       Gpu_shared/Gpu : 22.36
Repeat : 2^15   Cpu execution : 0       ms Gpu execution : 438  ms Gpu_shared execution : 20    ms Gpu/Cpu : 0.00       Gpu_shared/Gpu : 21.90
Repeat : 2^16   Cpu execution : 1       ms Gpu execution : 881  ms Gpu_shared execution : 39    ms Gpu/Cpu : 0.00       Gpu_shared/Gpu : 22.59
Repeat : 2^17   Cpu execution : 0       ms Gpu execution : 1768 ms Gpu_shared execution : 78    ms Gpu/Cpu : 0.00       Gpu_shared/Gpu : 22.67
Repeat : 2^18   Cpu execution : 0       ms Gpu execution : 3508 ms Gpu_shared execution : 158   ms Gpu/Cpu : 0.00       Gpu_shared/Gpu : 22.20
Repeat : 2^19   Cpu execution : 0       ms Gpu execution : 7068 ms Gpu_shared execution : 296   ms Gpu/Cpu : 0.00       Gpu_shared/Gpu : 23.88

Done...

0 Answers:

No answers yet.