Question

我刚开始学习CUDA，遇到了一个无法理解的问题。我正在编写一个程序，用Simpson法对函数 f(x) = x^2 进行数值积分。我的做法是：先创建一个边界数组，相邻的两个边界围成一个小区间（“框”），每个区间的面积用Simpson法计算；然后把该边界数组传给GPU，让每个处理器计算一个区间的面积，并返回一个由各区间面积组成的结果数组；最后把这些面积相加，得到完整的积分。问题出现在我访问GPU上的边界数组时：数组在CPU上的值是正确的，但复制到GPU并在GPU上访问之后，这些值全都变成了无意义的数，我找不到原因。我的代码如下，非常感谢任何帮助。

第一个文件是主程序，它获取用户输入并定义CPU端的数组。

#include <iostream> //Necessary for std::cout
#include <iomanip>  //Necessary for std::setprecision
#include <ctime>    //Necessary for clock_t

using namespace std;

// Forward declaration of the GPU driver routine that main calls below.
double* gpu_run(double * h_bound, double * h_resultArr, int SIZE);

// Host entry point: reads the upper integration limit from stdin, fills an
// array of interval bounds spaced `step` apart, passes it to gpu_run, then
// sums the returned per-interval areas and prints the total and the elapsed time.
int main() {

    double step = 0.5, upper = 0, result = 0;
    double * h_bound = NULL, * h_resultArr = NULL;
    int SIZE = 0;

    cout << "Enter the upper bound: ";
    cin >> upper;

    SIZE = upper/step + 1; //The number of bounds, which is one more than the number of intervals (the double value is truncated to int)

    h_bound = new double[SIZE];
    h_resultArr = new double[SIZE-1];

    // Bounds are 0, step, 2*step, ...
    for (int i = 0; i < SIZE; i++){
        h_bound[i] = i*step;
    }

    clock_t t = clock();

    h_resultArr = gpu_run(h_bound, h_resultArr, SIZE);

    // NOTE(review): h_resultArr was allocated with SIZE-1 elements, so the
    // last iteration (i == SIZE-1) reads one element past the end of the array.
    for (int i = 0; i < SIZE; i++){
        result += h_resultArr[i];
    }

    t = clock() - t;

    cout << "Calculation is done and took " << ((double)t)/CLOCKS_PER_SEC << " seconds." << endl;

    cout << "The integral of x^2 from 0 to " << upper << ", using Simpson's Method is: " << setprecision(10) << result << endl;

    // NOTE(review): h_bound and h_resultArr are never released with delete[].
    return 0;
}

此时main函数调用了gpu_run，它位于包含实际计算代码的CUDA文件中。当我用3作为积分上限（答案应为9）、步长取0.5时，得到的边界是0、0.5、1、1.5、2、2.5和3，与我的预期一致。gpu_run的代码是：

#include <iostream>
#include <cuda_runtime.h>
#include <cstdio>
#include "gpu_run.h"

using namespace std;

// Allocates device buffers for the bounds and the results, copies h_bound to
// the device, launches the simpsons kernel, copies the results back into
// h_resultArr and returns h_resultArr.
// NOTE(review): none of the CUDA runtime return codes are checked, and the
// device buffers are not freed in this function.
double* gpu_run(double * h_bound, double * h_resultArr, int SIZE) {

    double * d_bound = NULL;
    cudaMalloc((void **)&d_bound, sizeof(double)*SIZE);

    // NOTE(review): the third argument of cudaMemcpy is a size in bytes. SIZE is
    // the element count, so only SIZE bytes (not SIZE doubles) are copied here.
    cudaMemcpy(d_bound, h_bound, SIZE, cudaMemcpyHostToDevice);

    double * d_resultArr = NULL;
    cudaMalloc((void **)&d_resultArr, sizeof(double)*(SIZE-1));

    // Round the block count up so that every element index is covered by a thread.
    int threadsPerBlock = 256;
    int blocksPerGrid =(SIZE + threadsPerBlock - 1) / threadsPerBlock;
    simpsons<<<blocksPerGrid, threadsPerBlock>>>(d_bound, d_resultArr, SIZE);

    // NOTE(review): same issue as above, SIZE-1 is passed as a byte count
    // rather than (SIZE-1)*sizeof(double).
    cudaMemcpy(h_resultArr, d_resultArr, SIZE-1, cudaMemcpyDeviceToHost);

    return h_resultArr;

}

在这个程序中，d_ 前缀用于表示位于GPU上的数组；到这里为止，h_bound中存储的值仍然是正确的。最后是CUDA核函数simpsons，代码如下：

// Kernel: the thread with global index i reads bound[i] and bound[i+1] and
// stores the Simpson's-rule area of f(x) = x^2 over that interval in resultArr[i].
// SIZE is the number of bounds, so only indices below SIZE-1 do any work.
__global__ void simpsons(double * bound, double * resultArr, int SIZE){

    // Global thread index across the 1D grid.
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < SIZE-1){
        double a = bound[i];
        double b = bound[i+1];

        // NOTE(review): %d is an integer conversion, but bound[i] and bound[i+1]
        // are doubles, so the printed bounds do not reflect the stored values.
        printf("i: %d lower bound: %d  upper bound: %d \n", i, bound[i], bound[i+1]);

        // (b-a)/6 * (f(a) + 4*f((a+b)/2) + f(b)) with f(x) = x^2; 4*((a+b)/2)^2 == (a+b)^2.
        resultArr[i] = ((b-a)/6)*(a*a + (a+b)*(a+b) + b*b); 
    }

}

对于每个处理器，我需要它访问函数曲线下各自“框”的两个边界，并用Simpson法计算这两个边界之间的面积；然而，在这个函数中，bound数组里的值都是无意义的。我究竟做错了什么？我觉得这是一个非常愚蠢的错误，但我就是找不到它。

Answer 1

当我修改您的代码，解决了@talonmies指出的问题，并把printf的格式说明符改为能正确打印float/double数值的形式之后，得到的结果看起来与您的描述相符：

- 修改cudaMemcpy操作，使传入的字节数包含sizeof(double)
- 打印浮点数时将%d替换为%f（下面的代码中写作%lf，对printf而言效果相同）
- 我还添加了一个强制类型转换（cast），以消除将double转换为int时的编译警告

修正后的代码和运行结果：

$ cat t495.cu
#include <climits>  //Necessary for INT_MAX
#include <cstdio>   //Necessary for printf and fprintf
#include <cstdlib>  //Necessary for exit and EXIT_FAILURE
#include <ctime>    //Necessary for clock_t
#include <iomanip>  //Necessary for std::setprecision
#include <iostream> //Necessary for std::cout

using namespace std;

// Kernel computing the Simpson's-rule area of f(x) = x^2 for one interval per thread.
//   bound     - device array of SIZE interval bounds
//   resultArr - device array receiving SIZE-1 areas; entry k covers [bound[k], bound[k+1]]
//   SIZE      - number of bounds
// Expects a 1D launch; threads whose index has no interval return immediately.
__global__ void simpsons(double * bound, double * resultArr, int SIZE){

    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // SIZE bounds delimit SIZE-1 intervals, so surplus threads have nothing to do.
    if (idx >= SIZE-1){
        return;
    }

    const double lo = bound[idx];
    const double hi = bound[idx+1];

    // Debug trace of the interval handled by this thread.
    printf("i: %d lower bound: %lf  upper bound: %lf \n", idx, lo, hi);

    // (hi-lo)/6 * (f(lo) + 4*f(mid) + f(hi)); for f(x) = x^2, 4*f(mid) equals (lo+hi)^2.
    resultArr[idx] = ((hi-lo)/6)*(lo*lo + (lo+hi)*(lo+hi) + hi*hi);

}

// Prints a diagnostic naming the failed step and terminates the program if a
// CUDA runtime call did not return cudaSuccess.
static void checkCuda(cudaError_t status, const char * what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Computes, on the GPU, the Simpson's-rule area of every interval
// [h_bound[i], h_bound[i+1]].
//   h_bound     - host array of SIZE interval bounds
//   h_resultArr - host array with room for SIZE-1 results; filled by this call
//   SIZE        - number of bounds
// Returns h_resultArr. With fewer than two bounds there is no interval, so the
// array is returned untouched. Any CUDA failure is reported on stderr and ends
// the program instead of silently producing garbage.
double* gpu_run(double * h_bound, double * h_resultArr, int SIZE) {

    // Nothing to integrate; also avoids a zero-block launch and a negative byte count.
    if (SIZE < 2) {
        return h_resultArr;
    }

    const int intervals = SIZE - 1;
    // cudaMalloc/cudaMemcpy take sizes in bytes, not element counts.
    const size_t boundBytes = sizeof(double) * SIZE;
    const size_t resultBytes = sizeof(double) * intervals;

    double * d_bound = NULL;
    double * d_resultArr = NULL;
    checkCuda(cudaMalloc((void **)&d_bound, boundBytes), "cudaMalloc(d_bound)");
    checkCuda(cudaMalloc((void **)&d_resultArr, resultBytes), "cudaMalloc(d_resultArr)");

    checkCuda(cudaMemcpy(d_bound, h_bound, boundBytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(host to device)");

    // One thread per interval, block count rounded up.
    int threadsPerBlock = 256;
    int blocksPerGrid = (intervals + threadsPerBlock - 1) / threadsPerBlock;
    simpsons<<<blocksPerGrid, threadsPerBlock>>>(d_bound, d_resultArr, SIZE);
    // A kernel launch returns no status itself; launch errors are read back here.
    checkCuda(cudaGetLastError(), "simpsons kernel launch");

    // This blocking copy waits for the kernel and reports errors raised while it ran.
    checkCuda(cudaMemcpy(h_resultArr, d_resultArr, resultBytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(device to host)");

    // Release the device buffers so repeated calls do not leak GPU memory.
    checkCuda(cudaFree(d_bound), "cudaFree(d_bound)");
    checkCuda(cudaFree(d_resultArr), "cudaFree(d_resultArr)");

    return h_resultArr;

}

// Program entry point. Reads the upper integration limit from stdin, builds the
// interval bounds 0, step, 2*step, ... on the host, lets gpu_run compute the
// Simpson's-rule area of each interval, and prints the sum and the elapsed time.
// Returns 0 on success and 1 if the input is not a usable non-negative number.
// NOTE: the bound count is truncated to int, so when upper is not a multiple of
// step the sum only covers the range up to the last whole step below upper.
int main() {

    const double step = 0.5;
    double upper = 0, result = 0;

    cout << "Enter the upper bound: ";
    // Reject unreadable input, negatives and NaN (both fail upper >= 0), and
    // values whose bound count would not fit in an int.
    if (!(cin >> upper) || !(upper >= 0) || upper/step >= INT_MAX) {
        cerr << "Invalid upper bound: expected a non-negative number." << endl;
        return 1;
    }

    const int SIZE = (int)(upper/step + 1); //The number of bounds, which is one more than the number of intervals
    const int intervals = SIZE - 1;

    double * h_bound = new double[SIZE];
    double * h_resultArr = new double[intervals];

    for (int i = 0; i < SIZE; i++){
        h_bound[i] = i*step;
    }

    clock_t t = clock();

    double * areas = gpu_run(h_bound, h_resultArr, SIZE);

    // There are SIZE-1 areas; iterating up to SIZE would read past the end of the array.
    for (int i = 0; i < intervals; i++){
        result += areas[i];
    }

    t = clock() - t;

    cout << "Calculation is done and took " << ((double)t)/CLOCKS_PER_SEC << " seconds." << endl;

    cout << "The integral of x^2 from 0 to " << upper << ", using Simpson's Method is: " << setprecision(10) << result << endl;

    // Release the host buffers allocated above.
    delete[] h_bound;
    delete[] h_resultArr;

    return 0;
}
$ nvcc -arch=sm_20 -o t495 t495.cu
$ cuda-memcheck ./t495
========= CUDA-MEMCHECK
Enter the upper bound: 3
i: 0 lower bound: 0.000000  upper bound: 0.500000
i: 1 lower bound: 0.500000  upper bound: 1.000000
i: 2 lower bound: 1.000000  upper bound: 1.500000
i: 3 lower bound: 1.500000  upper bound: 2.000000
i: 4 lower bound: 2.000000  upper bound: 2.500000
i: 5 lower bound: 2.500000  upper bound: 3.000000
Calculation is done and took 1.08 seconds.
The integral of x^2 from 0 to 3, using Simpson's Method is: 9
========= ERROR SUMMARY: 0 errors
$

在CUDA中将变量复制到GPU内存

1 个答案: