Question

本周末我正在努力学习Cuda。

我想要做的是c = a + b。变量（a，b和c）中的每一个都是一个包含5个元素的数组。

我的结果有问题。下面是我想要的结果（我实际得到的错误输出见后文的程序运行记录）：

{a1, a2, a3, a4, a5} = {11.000000, 21.000000, 31.000000, 41.000000, 51.000000}
{b1, b2, b3, b4, b5} = {1.000000, 3.000000, 5.000000, 7.000000, 11.000000}
{c1, c2, c3, c4, c5} = {12.000000, 24.000000, 36.000000, 48.000000, 62.000000}

如您所见，结果（c3，c4，c5）错误。
请告诉我如何使下面的代码做正确的事。

我正在使用 VS2015 和 CUDA Toolkit 8。我在项目解决方案中创建了3个文件：main.cpp、simple_math.cu 和 simple_math.cuh。

这是我实际得到的输出（main.cpp 的源码见下一段代码）：

PS E:\testing\cuda2\Debug> .\cuda2.exe
{a1, a2, a3, a4, a5} = {11.000000, 21.000000, 31.000000, 41.000000, 51.000000}
{b1, b2, b3, b4, b5} = {1.000000, 3.000000, 5.000000, 7.000000, 11.000000}
{c1, c2, c3, c4, c5} = {12.000000, 24.000000, 0.000000, 0.000000, 0.000000}

main.cpp

#include "simple_math.cuh"
#include <iostream> // fprintf


int main()
{
    const int arraySize = 5;
    float a[arraySize] = { 11, 21, 31, 41, 51 };
    float b[arraySize] = { 1, 3, 5, 7, 11 };
    double c[arraySize] = { 0, 0, 0, 0, 0 };

    cudaError_t cudaStatus = mathWithCuda(c, a, b, arraySize, ADD);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "mathWithCuda failed!");
        return 1;
    }


    fprintf(stdout, "{a1, a2, a3, a4, a5} = {%f, %f, %f, %f, %f} \n{b1, b2, b3, b4, b5} = {%f, %f, %f, %f, %f} \n{c1, c2, c3, c4, c5} = {%f, %f, %f, %f, %f}",
        a[0], a[1], a[2], a[3], a[4], b[0], b[1], b[2], b[3], b[4], c[0], c[1], c[2], c[3], c[4]);


    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

simple_math.cuh（simple_math.cu 的源码在本页中未显示）

// Public header for the simple_math module: operation selectors, the
// host-side entry point, and the kernel prototypes. The include guard
// below prevents double inclusion.
#ifndef SIMPLE_MATH_CUH_
#define SIMPLE_MATH_CUH_


#include <cuda_runtime.h> // cudaError_t

// Operation selectors for the `mode` argument of mathWithCuda.
// NOTE(review): the names suggest add / subtract / multiply / divide; the
// dispatch itself is not visible in this header -- confirm against the
// implementation file.
#define ADD 0
#define SUB 1
#define MUL 2
#define DIV 3


// Host-side entry point.
//   c    - output array of double.
//   a, b - read-only input arrays of float.
//   size - element count (the visible caller passes the array length).
//   mode - one of ADD / SUB / MUL / DIV.
// Returns a cudaError_t; the visible caller treats anything other than
// cudaSuccess as failure.
// NOTE(review): c is double while a and b are float, so any byte count
// computed for c (allocation or copy) must use sizeof(double), not
// sizeof(float).
cudaError_t mathWithCuda(double *c, const float *a, const float *b, unsigned int size, int mode);

// Kernel prototypes, one per operation selector (definitions live elsewhere).
// Each writes through `c` and reads `a` and `b`.
// NOTE(review): none of these takes an element count, so the signatures give
// the kernel no way to bounds-check its index -- presumably the launch
// configuration must match the array length exactly; confirm in the
// implementation.
__global__ void addKernel(double *c, const float *a, const float *b);
__global__ void subKernel(double *c, const float *a, const float *b);
__global__ void mulKernel(double *c, const float *a, const float *b);
__global__ void divKernel(double *c, const float *a, const float *b);


#endif // SIMPLE_MATH_CUH_

Answer 1

问题似乎在这里：

cudaStatus = cudaMemcpy(c, dev_c, arraySize * sizeof(float), cudaMemcpyDeviceToHost);

我认为你应该复制 arraySize * sizeof(double) 个字节：c 是 double 数组，按 sizeof(float) 计算只会复制一半的数据（5 个 double 共 40 字节，只复制了 20 字节），所以 c3、c4、c5 仍然是 0。

我的Cuda脚本数组输出错误

1 个答案: