如何使gpu上的apriori实现比cpu上的相同代码更快地工作?

时间:2019-05-14 15:46:10

标签: c++ performance optimization cuda gpu

我正在做一个项目,需要在 GPU 上实现 Apriori 算法以获得最佳性能。我有两个内核代码:

// Flags which candidate itemsets meet the minimum support threshold:
// check_array[i] becomes 1 when d_input_array[i] >= MIN_SUP, else 0.
// Expects a 1D launch; out_of_range is the element count (bounds guard),
// so threads past the end of the array do nothing.
__global__ void generate_L(int d_input_array[], int check_array[],
                           int out_of_range)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= out_of_range)
        return;
    check_array[idx] = (d_input_array[idx] >= MIN_SUP) ? 1 : 0;
}

第二:

// Device-global match counter. The host resets it with cudaMemcpyToSymbol
// before each launch and reads it back with cudaMemcpyFromSymbol afterwards.
__device__ int EQUAL;

// For each element b[index], scans a[0..a_size) and increments EQUAL once
// on the first match. One thread per element of b; expects a 1D launch
// with at least b_size threads.
// Fixes vs. original:
//  - adds the missing `index < b_size` guard: extra threads in the last
//    block previously read b[] out of bounds;
//  - replaces the non-atomic `EQUAL++` with atomicAdd — many threads can
//    find a match concurrently, so a plain read-modify-write loses counts.
__global__ void set_count_on_gpu(int a[], int b[], int a_size,
                                 int b_size)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    // b must fit inside a for "b is a subset of a" counting to make sense.
    if (index < b_size && b_size <= a_size)
    {
        for (int j = 0; j < a_size; ++j)
        {
            if (b[index] == a[j])
            {
                atomicAdd(&EQUAL, 1);  // race-free increment
                break;
            }
        }
    }
}

内核调用部分:

// Counts, for every candidate itemset (key of map C), how many of its
// items occur in the flat item array `a`; when every item matches, the
// candidate's support count (ii->second) is incremented.
// NOTE(review): `VI`, `C`, and FOR_MAP are project types/macros not visible
// here — presumably VI is std::vector<int> and C maps itemset -> count;
// confirm against the surrounding file.
int * a_array = a.data();
// All keys of C have the same length at a given Apriori level.
int array_size = C.begin()->first.size();
int * b_array;// = (int *)malloc(C.begin()->first.size() * sizeof(int));
int * d_a_array;
int * d_b_array;
// Device buffers; d_b_array is allocated once and reused for every candidate.
// NOTE(review): no cudaError_t checks anywhere in this fragment — failures
// (e.g. an oversized launch below) go unnoticed.
cudaMalloc((void**)&d_b_array, array_size * sizeof(int));
cudaMalloc((void**)&d_a_array, a.size()*sizeof(int));
cudaMemcpy(d_a_array, a_array, a.size()*sizeof(int),cudaMemcpyHostToDevice);
FOR_MAP(ii,C)
{
    VI b;
    b.clear();
    b=ii->first;
    b_array = b.data();
    // Upload the current candidate itemset.
    cudaMemcpy(d_b_array, b_array, array_size * sizeof(int),cudaMemcpyHostToDevice);
    int block_num = 1;
    int eq = 0;
    // Reset the device-global match counter before each launch.
    cudaMemcpyToSymbol(EQUAL,&eq,sizeof(int));
    // NOTE(review): one block of array_size threads — silently fails if
    // array_size exceeds the per-block thread limit (1024). The memcpy +
    // symbol reset + launch + symbol readback done per map entry is the
    // likely reason this is slower than the CPU version: the GPU work per
    // launch is tiny relative to the host<->device round trips.
    set_count_on_gpu<<<block_num,array_size>>>(d_a_array,d_b_array, a.size(),array_size);
    // cudaMemcpyFromSymbol blocks until the kernel has finished.
    cudaMemcpyFromSymbol(&eq, EQUAL,sizeof(int));
    // All items of the candidate were found in `a`.
    if (eq==b.size())
    {
        ii->second++;
    }
}

第二次通话:

// Main Apriori level loop: generate candidates (C), prune, count support by
// scanning the dataset, then filter C into the frequent set L on the GPU.
// NOTE(review): relies on outer-scope `index` (level counter) and `thr`
// (assumed to be 0 on entry — it is only reset near the bottom of the loop).
while(true)
{
        // Hard cap at 7 Apriori levels.
        if(index>7)
            break;
        generate_C();
        if(C.size()==0)
            break;
        cout<<"\nC"<<index<<"\n";
        output(C);
        prune();
        if (C.size()==0)
        {
            break;
        }
        cout<<"\nC"<<index<<" after prune \n";
        output(C);
        scan_D();
        cout<<"\nC"<<index<<"after scaning dataset \n";
        output(C);
        L.clear();
        // NOTE(review): fresh malloc/cudaMalloc every level; the `break` at
        // "L.size()==0" below exits before the free/cudaFree calls, leaking
        // all four buffers. Allocating once outside the loop (or RAII /
        // thrust::device_vector) would fix both issues.
        int * check_array = (int *) malloc(C.size()*sizeof(int));
        int * input_array = (int *) malloc(C.size()*sizeof(int));
        int * d_check_array;
        int * d_input_array;
        cudaMalloc((void**)&d_check_array,  C.size()*sizeof(int));
        cudaMalloc((void**)&d_input_array,  C.size()*sizeof(int));
        // Flatten the support counts of C into input_array (order = map order).
        FOR_MAP(ii,C)
        {
            input_array[thr] = ii->second;
            thr++;
        }
        cudaMemcpy(d_input_array, input_array,C.size()*sizeof(int),cudaMemcpyHostToDevice);
        int block_num = 1;
        if(C.size() < MAX_THREADS)
        {
            generate_L<<<block_num,C.size()>>> (d_input_array,d_check_array, C.size());
        }
        else
        {
            // NOTE(review): BUG — integer division drops the remainder:
            // block_num * (C.size()/block_num) <= C.size(), so when C.size()
            // is not a multiple of MAX_THREADS the tail elements of
            // d_check_array are never written, and the host then reads
            // uninitialized device memory (cudaMalloc does not zero).
            // Standard fix: block_num = (C.size() + MAX_THREADS - 1) / MAX_THREADS
            // with MAX_THREADS threads per block (the kernel already guards).
            block_num  = C.size()/MAX_THREADS;
            generate_L<<<block_num,C.size()/block_num>>> (d_input_array,d_check_array, C.size());
        }
        // Blocking copy — also synchronizes with the kernel. No
        // cudaGetLastError() check, so launch-config errors pass silently.
        cudaMemcpy(check_array, d_check_array,C.size()*sizeof(int),cudaMemcpyDeviceToHost);
        thr = 0;
        // Keep only the candidates the kernel flagged as frequent.
        FOR_MAP(ii,C)
        {
            if(check_array[thr] == 1)
            {
                L[ii->first] = ii->second;
            }
            thr++;
        }
        if (L.size()==0)
        {
            break;
        }
        cout<<"\nL"<<index<<"\n";
        output(L);
        index++;
        thr = 0;
        free(check_array);
        cudaFree(d_check_array);
        free(input_array);
        cudaFree(d_input_array);
        //generate_L();
}

我试图在不使用gpu的情况下运行实现,但是测量表明,在cpu代码上运行速度更快。

每个内核调用的块和线程大小都不相同。

这是我基于 Apriori-Implementation-On-Cpu 修改而来的 CPU 实现。

如何优化它?

0 个答案:

没有答案