在CUDA中将变量复制到GPU内存

时间:2014-07-21 16:24:53

标签: cuda gpu

我刚开始接触CUDA,遇到了一个无法理解的问题。我正在编写一个程序,用辛普森法(Simpson's rule)对函数 f(x) = x^2 进行数值积分。我的做法是:先创建一个边界数组,相邻两个边界之间构成一个小区间(“框”),每个小区间的面积用辛普森法计算;然后把该边界数组传给GPU,让每个线程计算一个小区间的面积,并返回一个存放各面积的结果数组;最后把这些面积相加,得到完整的积分值。问题出现在我尝试在GPU上访问边界数组的时候:该数组在CPU上的值是正确的,但复制到GPU并在GPU上访问之后,得到的值都毫无意义,我找不到原因。我的代码如下,非常感谢任何帮助。

第一个文件是主程序,负责获取用户输入并定义CPU端的数组。

#include <iostream> //Necessary for std::cout
#include <iomanip>  //Necessary for std::setprecision
#include <ctime>    //Necessary for clock_t

using namespace std;

double* gpu_run(double * h_bound, double * h_resultArr, int SIZE);

/*
 * Program entry point.
 *
 * Reads the upper integration bound from standard input, builds the array of
 * interval bounds 0, step, 2*step, ... on the host, asks gpu_run to compute the
 * area of every interval, sums those areas and prints the integral of x^2
 * together with the elapsed time.
 *
 * Returns 0 on success and 1 when the input cannot be used.
 */
int main() {

    double step = 0.5, upper = 0, result = 0;
    double * h_bound = NULL, * h_resultArr = NULL;
    int SIZE = 0;

    cout << "Enter the upper bound: ";
    cin >> upper;

    // Reject unreadable or negative input before it turns into a bogus array size.
    if (!cin || upper < 0) {
        cerr << "The upper bound must be a non-negative number." << endl;
        return 1;
    }

    SIZE = upper/step + 1; //The number of bounds, which is one more than the number of integration times

    h_bound = new double[SIZE];
    h_resultArr = new double[SIZE-1];

    for (int i = 0; i < SIZE; i++){
        h_bound[i] = i*step;
    }

    clock_t t = clock();

    h_resultArr = gpu_run(h_bound, h_resultArr, SIZE);

    // There are SIZE-1 intervals, so only SIZE-1 results exist; summing up to
    // SIZE would read one element past the end of h_resultArr.
    for (int i = 0; i < SIZE-1; i++){
        result += h_resultArr[i];
    }

    t = clock() - t;

    cout << "Calculation is done and took " << ((double)t)/CLOCKS_PER_SEC << " seconds." << endl;

    cout << "The integral of x^2 from 0 to " << upper << ", using Simpson's Method is: " << setprecision(10) << result << endl;

    // Release the host buffers allocated above.
    delete[] h_bound;
    delete[] h_resultArr;

    return 0;
}

此时main函数会调用gpu_run,它位于CUDA代码中,负责完成实际的计算。当我把积分上限设为3(此时答案应为9)、步长设为0.5时,得到的边界是0、0.5、1、1.5、2、2.5和3,符合我的预期。gpu_run的代码如下:

#include <iostream>
#include <cuda_runtime.h>
#include <cstdio>
#include "gpu_run.h"

using namespace std;

/*
 * Host-side wrapper around the simpsons kernel (code shown as originally asked).
 *
 * Allocates device buffers for SIZE bounds and SIZE-1 results, copies h_bound to
 * the device, launches simpsons, copies the results back into h_resultArr and
 * returns h_resultArr.
 *
 * NOTE(review): none of the CUDA calls have their return value checked, and the
 * device buffers are never released with cudaFree.
 */
double* gpu_run(double * h_bound, double * h_resultArr, int SIZE) {

    double * d_bound = NULL;
    cudaMalloc((void **)&d_bound, sizeof(double)*SIZE);

    // NOTE(review): the third argument of cudaMemcpy is a size in bytes. Passing
    // SIZE copies SIZE bytes, not SIZE doubles, so most of d_bound is never
    // written; it should be SIZE*sizeof(double).
    cudaMemcpy(d_bound, h_bound, SIZE, cudaMemcpyHostToDevice);

    double * d_resultArr = NULL;
    cudaMalloc((void **)&d_resultArr, sizeof(double)*(SIZE-1));

    // Ceiling division: enough 256-thread blocks to give every index below SIZE a thread.
    int threadsPerBlock = 256;
    int blocksPerGrid =(SIZE + threadsPerBlock - 1) / threadsPerBlock;
    simpsons<<<blocksPerGrid, threadsPerBlock>>>(d_bound, d_resultArr, SIZE);

    // NOTE(review): same byte-count problem as above; SIZE-1 bytes are copied
    // back instead of (SIZE-1)*sizeof(double).
    cudaMemcpy(h_resultArr, d_resultArr, SIZE-1, cudaMemcpyDeviceToHost);

    return h_resultArr;

}

在这个程序中,前缀 d_ 用于标识位于GPU上的数组。此时存储在h_bound中的值仍然是正确的。最后是CUDA核函数simpsons,其定义如下:

/*
 * Kernel (code shown as originally asked): each thread whose global index i is
 * below SIZE-1 reads bound[i] and bound[i+1], prints them, and stores the area
 * of that interval in resultArr[i].
 */
__global__ void simpsons(double * bound, double * resultArr, int SIZE){

    // Flat global thread index for a 1D launch.
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    // One thread per interval; surplus threads in the last block do nothing.
    if (i < SIZE-1){
        double a = bound[i];
        double b = bound[i+1];

        // NOTE(review): %d expects an int, but bound[i] and bound[i+1] are
        // doubles, so the printed bounds are meaningless even when the data is
        // correct; a floating-point specifier such as %f is needed.
        printf("i: %d lower bound: %d  upper bound: %d \n", i, bound[i], bound[i+1]);

        // Simpson's rule (b-a)/6 * (f(a) + 4*f((a+b)/2) + f(b)) for f(x) = x^2;
        // the middle term 4*((a+b)/2)^2 simplifies to (a+b)^2.
        resultArr[i] = ((b-a)/6)*(a*a + (a+b)*(a+b) + b*b); 
    }

}

每个线程需要访问函数曲线下它所负责的那个“框”的两个边界,并用辛普森法计算这两个边界之间的面积。然而,在这个核函数中,边界数组里的值都是无意义的。我究竟做错了什么?我觉得这是一个非常愚蠢的错误,但我就是找不到它。

1 个答案:

答案 0 :(得分:1)

当我按照 @talonmies 指出的问题修改您的代码,并把printf的格式说明符改成能正确打印float/double数值的形式之后,得到的结果与您的描述相符,看起来是正确的:

  1. 修改cudaMemcpy操作以包含sizeof(double)
  2. 在打印浮点数量时将%d替换为%f
  3. 我还添加了一个强制类型转换(cast),以消除将double转换为int的警告

    修正后的代码和运行结果:

    $ cat t495.cu
    #include <cstdio>   //Necessary for printf and fprintf
    #include <cstdlib>  //Necessary for exit and EXIT_FAILURE
    #include <ctime>    //Necessary for clock_t
    #include <iomanip>  //Necessary for std::setprecision
    #include <iostream> //Necessary for std::cout
    
    using namespace std;
    
    /*
     * Kernel: integrates f(x) = x^2 over one interval per thread.
     *
     * The thread with global index idx (1D launch) reads bound[idx] and
     * bound[idx+1], prints them for debugging, and writes the area of that
     * interval to resultArr[idx]. Threads with idx >= SIZE-1 do nothing.
     */
    __global__ void simpsons(double * bound, double * resultArr, int SIZE){

        const int idx = threadIdx.x + blockIdx.x * blockDim.x;

        // Surplus threads in the last block have no interval to process.
        if (idx >= SIZE-1){
            return;
        }

        const double lo = bound[idx];
        const double hi = bound[idx+1];

        printf("i: %d lower bound: %lf  upper bound: %lf \n", idx, lo, hi);

        // Simpson's rule (hi-lo)/6 * (f(lo) + 4*f(mid) + f(hi)) with f(x) = x^2;
        // the middle term 4*((lo+hi)/2)^2 simplifies to (lo+hi)^2.
        resultArr[idx] = ((hi-lo)/6)*(lo*lo + (lo+hi)*(lo+hi) + hi*hi);
    }
    
    // Prints a diagnostic and terminates the program if a CUDA runtime call failed.
    static void checkCuda(cudaError_t status, const char * what) {
        if (status != cudaSuccess) {
            fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    }

    /*
     * Host-side wrapper around the simpsons kernel.
     *
     * h_bound      host array of SIZE interval bounds (input)
     * h_resultArr  host array with room for SIZE-1 doubles (output)
     * SIZE         number of bounds; the number of intervals is SIZE-1
     *
     * Copies the bounds to the device, launches simpsons with one thread per
     * interval, copies the SIZE-1 interval areas back into h_resultArr and frees
     * the device buffers. Returns h_resultArr. When SIZE < 2 there is nothing to
     * integrate and h_resultArr is returned untouched. Any CUDA failure is
     * reported on stderr and terminates the program.
     */
    double* gpu_run(double * h_bound, double * h_resultArr, int SIZE) {

        // With fewer than two bounds there is no interval; skip the GPU entirely
        // (a zero-block launch is an invalid configuration, and SIZE-1 would not
        // be a usable allocation size).
        if (SIZE < 2) {
            return h_resultArr;
        }

        const int intervals = SIZE - 1;
        const size_t boundBytes  = sizeof(double) * SIZE;
        const size_t resultBytes = sizeof(double) * intervals;

        double * d_bound = NULL;
        checkCuda(cudaMalloc((void **)&d_bound, boundBytes), "cudaMalloc(d_bound)");

        // cudaMemcpy takes a size in bytes, not an element count.
        checkCuda(cudaMemcpy(d_bound, h_bound, boundBytes, cudaMemcpyHostToDevice),
                  "cudaMemcpy(host to device)");

        double * d_resultArr = NULL;
        checkCuda(cudaMalloc((void **)&d_resultArr, resultBytes), "cudaMalloc(d_resultArr)");

        // Ceiling division: one thread per interval.
        int threadsPerBlock = 256;
        int blocksPerGrid = (intervals + threadsPerBlock - 1) / threadsPerBlock;
        simpsons<<<blocksPerGrid, threadsPerBlock>>>(d_bound, d_resultArr, SIZE);

        // A kernel launch returns nothing; configuration errors surface here.
        checkCuda(cudaGetLastError(), "simpsons kernel launch");

        // Blocking copy: waits for the kernel and reports errors raised while it ran.
        checkCuda(cudaMemcpy(h_resultArr, d_resultArr, resultBytes, cudaMemcpyDeviceToHost),
                  "cudaMemcpy(device to host)");

        // Release the device buffers so repeated calls do not leak GPU memory.
        checkCuda(cudaFree(d_bound), "cudaFree(d_bound)");
        checkCuda(cudaFree(d_resultArr), "cudaFree(d_resultArr)");

        return h_resultArr;

    }
    
    /*
     * Program entry point.
     *
     * Reads the upper integration bound from standard input, builds the array of
     * interval bounds 0, step, 2*step, ... on the host, asks gpu_run to compute
     * the area of every interval, sums those areas and prints the integral of
     * x^2 together with the elapsed time.
     *
     * Returns 0 on success and 1 when the input cannot be used.
     */
    int main() {

        double step = 0.5, upper = 0, result = 0;
        double * h_bound = NULL, * h_resultArr = NULL;
        int SIZE = 0;

        cout << "Enter the upper bound: ";
        cin >> upper;

        // Reject unreadable or negative input before it turns into a bogus array size.
        if (!cin || upper < 0) {
            cerr << "The upper bound must be a non-negative number." << endl;
            return 1;
        }

        SIZE = (int)(upper/step + 1); //The number of bounds, which is one more than the number of integration times

        h_bound = new double[SIZE];
        h_resultArr = new double[SIZE-1];

        for (int i = 0; i < SIZE; i++){
            h_bound[i] = i*step;
        }

        clock_t t = clock();

        h_resultArr = gpu_run(h_bound, h_resultArr, SIZE);

        // There are SIZE-1 intervals, so only SIZE-1 results exist; summing up
        // to SIZE would read one element past the end of h_resultArr.
        for (int i = 0; i < SIZE-1; i++){
            result += h_resultArr[i];
        }

        t = clock() - t;

        cout << "Calculation is done and took " << ((double)t)/CLOCKS_PER_SEC << " seconds." << endl;

        cout << "The integral of x^2 from 0 to " << upper << ", using Simpson's Method is: " << setprecision(10) << result << endl;

        // Release the host buffers allocated above.
        delete[] h_bound;
        delete[] h_resultArr;

        return 0;
    }
    $ nvcc -arch=sm_20 -o t495 t495.cu
    $ cuda-memcheck ./t495
    ========= CUDA-MEMCHECK
    Enter the upper bound: 3
    i: 0 lower bound: 0.000000  upper bound: 0.500000
    i: 1 lower bound: 0.500000  upper bound: 1.000000
    i: 2 lower bound: 1.000000  upper bound: 1.500000
    i: 3 lower bound: 1.500000  upper bound: 2.000000
    i: 4 lower bound: 2.000000  upper bound: 2.500000
    i: 5 lower bound: 2.500000  upper bound: 3.000000
    Calculation is done and took 1.08 seconds.
    The integral of x^2 from 0 to 3, using Simpson's Method is: 9
    ========= ERROR SUMMARY: 0 errors
    $
    
相关问题