CUDA:向内核传递结构体数组

时间:2019-11-21 05:50:44

标签: cuda

我是cuda的新手,正在尝试并行化以下所示的非常简单的程序,该程序的灵感来自以下链接:https://devblogs.nvidia.com/even-easier-introduction-cuda/

/* One cell of the grid processed by add(): a value, a slot for the freshly
 * computed value, and links to 20 other cells. */
typedef struct{
    int temp;           /* current value; add() reads this from each linked cell */
    int newtemp;        /* written by add(); main() copies it back into temp afterwards */
    int neighbors[20];  /* indices into the same S array, used by add() as s[neighbors[j]] */
} S;
/*
 * Sequential reference version.
 *
 * For every cell in s[0..n-1], adds up the temp of the 20 cells named in its
 * neighbors array, divides that sum by 3 (integer division) and stores the
 * result in the cell's newtemp. temp itself is never modified here.
 *
 * n: number of cells in s.
 * s: array of n cells; every neighbors[] entry is used directly as an index
 *    into s.
 */
void add(int n, S * s){
    int cell = 0;
    while (cell < n) {
        const int *links = s[cell].neighbors;
        int sum = 0;
        for (int k = 0; k != 20; ++k) {
            sum += s[links[k]].temp;
        }
        s[cell].newtemp = sum / 3;
        ++cell;
    }
}
int main(int argc, char *argv[]){
    int n = 1<<21;
    S grid[n];

    for(int i = 0; i < n; i++){
        S tmp1;
        tmp1.temp = rand();
        for(int j = 0; j<20; j++){
            tmp1.neighbors[j] = rand()%n;
        }
        grid[i] = tmp1;
    }
    struct timespec start, end;
    double gettime_diff, time_diff; 
    clock_t t, starttime, endtime; 

    clock_gettime(CLOCK_REALTIME, &start);
    t = clock(); 
    time(&starttime);

    add(n,grid);

    for(int i = 0; i < n; i++){
        grid[i].temp = grid[i].newtemp;
        if(i%83940==1)printf("%d\n",grid[i].temp);
    }
    return 0;
}

但是我没有得到预期的结果:更新temp之后,所有新值都是0。我认为问题在于,传给add函数的结构体数组没有被正确地传递到设备内存中。但是,我不知道该如何解决这个问题。我在Stack Overflow上找到了下面这篇帖子,但不确定其中建议的答案是如何解决这个问题的:Array of structs of arrays CUDA C

我所说的CUDA代码如下:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define SIZE 1000
#define NS_PER_US 1000

/* One cell of the grid processed by the add kernel: a value, a slot for the
 * freshly computed value, and links to 20 other cells. */
typedef struct{
    int temp;           /* current value; the kernel reads this from each linked cell */
    int newtemp;        /* written by the kernel; main() copies it back into temp */
    int neighbors[20];  /* indices into the same S array, used as s[neighbors[j]] */
} S;
// Kernel as posted in the question: for each cell i visited by this thread,
// sums the temp of the 20 cells listed in s[i].neighbors, divides the sum by 3
// and stores it in s[i].newtemp.
//
// NOTE(review): the start index and the step are derived from threadIdx.x and
// blockDim.x only; blockIdx.x and gridDim.x are not used. In a launch with
// more than one block, every block therefore visits the same set of cells.
// NOTE(review): the two printf calls run once per visited cell; they look like
// leftover debugging output.
__global__ void add(int n, S * s){
    int index = threadIdx.x;
    int stride = blockDim.x;
    //printf("%d\n",(n-index)/stride);
    //printf("%d\n",s[0].temp);
    for(int i = index; i < n; i+=stride){
        printf("%d\n",index);
        int newTemp = 0;
        // Accumulate the current value of every linked cell.
        for(int j = 0; j < 20; j++){
            newTemp += s[s[i].neighbors[j]].temp;
        }
        printf("%d\n",index);
        newTemp /= 3;
        s[i].newtemp = newTemp;
    }
}

// Host driver as posted in the question (reported by its author to print only
// zeros). It fills a local array of n cells with random data, launches add on
// 2 blocks of 5 threads, copies newtemp back into temp while printing a
// sample, and finally prints three elapsed-time measurements.
int main(int argc, char *argv[]){
    // h_a / d_a: host and device scratch buffers. They are allocated and
    // copied below, but the kernel is never given d_a.
    int  *h_a;
    int  *d_a;
    int  num_blocks= 2;
    int  num_th_per_blk= 5;

    int n = 1<<21;
    // grid is an automatic array of n cells here, not a pointer variable.
    S grid[n];

    // Random initial value and 20 random neighbor indices in [0, n) per cell.
    for(int i = 0; i < n; i++){
        S tmp1;
        tmp1.temp = rand();
        for(int j = 0; j<20; j++){
            tmp1.neighbors[j] = rand()%n;
        }
        grid[i] = tmp1;
    }
    struct timespec start, end;
    double gettime_diff, time_diff; 
    clock_t t, starttime, endtime; 

    // Start of the timed region, measured with three different clocks.
    clock_gettime(CLOCK_REALTIME, &start);
    t = clock(); 
    time(&starttime);

    // One int per launched thread.
    size_t  memSize;
    memSize = num_blocks* num_th_per_blk* sizeof(int);
    h_a= (int*) malloc(memSize);

    // NOTE(review): &grid is the address of the array declared above, cast to
    // void**. cudaMallocManaged expects the address of a pointer variable that
    // it can overwrite with a new allocation, so the data initialized above is
    // not what the kernel ends up working on -- this is the main defect that
    // the accompanying answer identifies.
    cudaMallocManaged((void **)&grid, n * sizeof(S));
    cudaMalloc( (void**) &d_a, memSize);
    dim3  dimGrid(num_blocks);
    dim3  dimBlock(num_th_per_blk);    

    add<<< dimGrid, dimBlock >>>(n,grid);

    // Copies d_a back to the host; nothing in this program writes to d_a.
    cudaMemcpy( h_a, d_a, memSize,cudaMemcpyDeviceToHost);

    // Commit the new values and print a sample of them.
    for(int i = 0; i < n; i++){
        grid[i].temp = grid[i].newtemp;
        if(i%83940==1)printf("%d\n",grid[i].newtemp);
    }
    // End of the timed region.
    clock_gettime(CLOCK_REALTIME, &end); 
    t = clock() - t; 
    time(&endtime);
    // Whole seconds scaled by CLOCKS_PER_SEC plus the nanosecond part divided
    // by NS_PER_US.
    gettime_diff = (double) ((end.tv_sec - start.tv_sec)*CLOCKS_PER_SEC) + (double)((end.tv_nsec - start.tv_nsec)/NS_PER_US);
    time_diff = difftime(endtime, starttime);

    printf("\ttime (clock_gettime) %f\n", gettime_diff);
    printf("\ttime (clock) %f\n", ((float)t)/CLOCKS_PER_SEC);
    printf("\ttime (time) %f\n", time_diff); 

    return 0;
}

我觉得这个问题要么没有简单的修复方法,要么是我漏掉了某个关键概念。无论如何,如能得到任何帮助,我将不胜感激。

1 个答案:

答案 0 :(得分:2)

实际上,您的代码中有很多错误,多到直接发布一个可以运行的版本,比逐一指出每个错误更容易:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define NS_PER_US 1000

/* One cell of the grid processed by the add kernel: a value, a slot for the
 * freshly computed value, and links to 20 other cells. */
typedef struct{
    int temp;           /* current value; the kernel reads this from each linked cell */
    int newtemp;        /* written by the kernel; main() copies it back into temp */
    int neighbors[20];  /* indices into the same S array, used as s[neighbors[j]] */
} S;

// Grid-stride kernel over a 1D launch.
//
// Each thread starts at its global thread index and advances by the total
// number of launched threads, so any grid/block size covers all n cells.
// For every cell it visits, the thread sums the temp of the 20 cells listed in
// that cell's neighbors array, divides the sum by 3 (integer division) and
// stores the result in newtemp. temp is only read, never written.
//
// n: number of cells in s.
// s: array of n cells accessible from the device; every neighbors[] entry is
//    used directly as an index into s.
__global__ 
void add(int n, S * s)
{
    const int firstCell = blockIdx.x * blockDim.x + threadIdx.x;
    const int step = gridDim.x * blockDim.x;

    int cell = firstCell;
    while (cell < n) {
        const int *links = s[cell].neighbors;
        int sum = 0;
        for (int k = 0; k != 20; ++k) {
            sum += s[links[k]].temp;
        }
        s[cell].newtemp = sum / 3;
        cell += step;
    }
}

/* Prints a readable message and exits if a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: allocates the grid in managed memory, fills it with random
 * data, runs the add kernel once, commits newtemp into temp while printing a
 * sample, and reports the elapsed time measured with three different clocks.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main(int argc, char *argv[]){
    (void)argc;
    (void)argv;

    int n = 1<<10;

    /* Managed memory: the same pointer is valid on the host and in the kernel. */
    S* grid = NULL;
    checkCuda(cudaMallocManaged((void **)&grid, n * sizeof(S)), "cudaMallocManaged");

    for(int i = 0; i < n; i++){
        /* temp stays below n so the 20-term sum in the kernel fits in an int. */
        grid[i].temp = rand()%n;
        grid[i].newtemp = 0;
        for(int j = 0; j<20; j++){
            grid[i].neighbors[j] = rand()%n;
        }
    }

    struct timespec start, end;
    double gettime_diff, time_diff;
    clock_t t;
    time_t starttime, endtime;   /* time()/difftime() operate on time_t */

    clock_gettime(CLOCK_REALTIME, &start);
    t = clock();
    time(&starttime);

    int  num_th_per_blk= 32;
    /* Ceiling division. The former "a + b ? 1 : 0" form parsed as
     * "(a + b) ? 1 : 0" and always launched a single block. */
    int  num_blocks= (n + num_th_per_blk - 1) / num_th_per_blk;

    dim3  dimGrid(num_blocks);
    dim3  dimBlock(num_th_per_blk);

    add<<< dimGrid, dimBlock >>>(n,grid);
    /* Launch-configuration errors are only visible through cudaGetLastError. */
    checkCuda(cudaGetLastError(), "add kernel launch");
    /* Wait for the kernel before the host touches the managed buffer; this
     * also surfaces errors raised while the kernel was running. */
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    for(int i = 0; i < n; i++){
        grid[i].temp = grid[i].newtemp;
        if(i%10==1)printf("%d %d\n",i,grid[i].temp);
    }

    clock_gettime(CLOCK_REALTIME, &end);
    t = clock() - t;
    time(&endtime);

    /* Wall-clock difference in microseconds, computed in floating point so
     * the sub-microsecond part is not truncated. */
    gettime_diff = (double)(end.tv_sec - start.tv_sec) * 1e6
                 + (double)(end.tv_nsec - start.tv_nsec) / NS_PER_US;
    time_diff = difftime(endtime, starttime);

    printf("\ttime (clock_gettime) %f\n", gettime_diff);
    printf("\ttime (clock) %f\n", ((float)t)/CLOCKS_PER_SEC);
    printf("\ttime (time) %f\n", time_diff);

    checkCuda(cudaFree(grid), "cudaFree");
    return 0;
}

最严重的错误在于主机代码中对grid的处理方式。这样做:

// grid is declared as an array object, not as an S* pointer variable.
S grid[n];

// code initializing grid

// The address of that array is then passed where the address of a pointer
// variable is expected.
cudaMallocManaged((void **)&grid, n * sizeof(S));

是非法的(您不应该尝试把grid设置为另一个指针值,因为它根本不是指针),而且毫无意义。cudaMallocManaged会分配一块新的内存,因此您实际所做的是:先初始化grid,然后丢弃所有精心初始化好的内存,换成一块未初始化的内存传递给内核。于是,内核是在随机数据上进行运算。还要注意,内核中的网格跨步循环(grid-stride loop)也不正确;另外,由于两个版本都使用rand()来初始化结构体的temp成员,原始代码和CUDA版本都可能发生整数溢出。