Question

我正尝试在 Windows 上通过 Go 的 cgo 使用 CUDA Runtime API。我已经折腾了好几天，现在卡住了：链接时提示我的内核包装函数是未定义的引用（undefined reference）。

我已将内核及其包装函数拆分到以下文件中：

文件：cGo.cuh

#ifndef CGO_CUH
#define CGO_CUH

// Element types of the two buffers handed to the kernel.
typedef unsigned long int ktype;
typedef unsigned char glob;

// CGO_API marks a function as exported from the shared library.
// On Windows nvcc hands host code to MSVC, which places a function in a DLL's
// export table only when it is declared __declspec(dllexport) (or listed in a
// .def file). Elsewhere the macro expands to nothing.
#if defined(_WIN32)
#define CGO_API __declspec(dllexport)
#else
#define CGO_API
#endif

/*
Function prototypes
*/

// Host-side wrapper around the kernel. extern "C" gives it an unmangled
// symbol name; CGO_API exports it, and the definition inherits that attribute.
extern "C" CGO_API void kernel_kValid(int, int, ktype *, glob *);

// Device kernel.
__global__ void kValid(ktype *, glob *);

#endif // CGO_CUH

文件：cGo.cu

#include "cGo.cuh"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_runtime.h"

//function Definitions

/*
kernel_kValid is a wrapper function for the CUDA Kernel to be called from Go
*/
extern "C" void kernel_kValid(int blocks, int threads, ktype *kInfo, glob *values) {
    kValid<<<blocks, threads>>>(kInfo, values);//execute the kernel
}


/*
kValid is the CUDA kernel launched by kernel_kValid. It receives the kInfo and
values pointers that the wrapper forwards; the body is elided in this excerpt.
*/
__global__ void kValid(ktype *kInfo, glob *values) {
    // kernel body omitted ("lots of code" in the original post)
}

我将CUDA源代码编译成共享库：

nvcc -shared -o myLib.so cGo.cu

然后我创建了一个供 cgo 包含的头文件：

文件：cGo.h

/* Element types used by the wrapper's pointer arguments. */
typedef unsigned long ktype;
typedef unsigned char glob;

/*
 * Function declarations
 */

/* Wrapper that launches the CUDA kernel; this header only declares it. */
void kernel_kValid(int blocks, int threads, ktype *kInfo, glob *values);

然后在 Go 包中通过 cgo 调用我的内核包装函数：

package cuda
/*
// Notes sit on their own lines: anything that follows the flags on a #cgo
// line is parsed as additional flags, not as a comment.

// CUDA runtime library.
#cgo LDFLAGS: -LC:/Storage/Cuda/lib/x64 -lcudart
// Shared library that contains the kernel wrapper.
#cgo LDFLAGS: -L${SRCDIR}/lib -lmyLib
// CUDA headers.
#cgo CPPFLAGS: -IC:/Storage/Cuda/include
// Directory that contains cGo.h.
#cgo CPPFLAGS: -I${SRCDIR}/include

#include <cuda_runtime.h>
#include <stdlib.h>
#include "cGo.h"
*/
import "C"

// useKernel launches the kernel through its C wrapper and then calls
// cudaDeviceSynchronize. B, T, storageDevice, globDevice, cudaErr and err are
// expected to be declared by the code elided from this excerpt.
func useKernel() {
	//other code

	// cgo maps the wrapper's ktype* and glob* parameters to *C.ktype and
	// *C.glob. An unsafe.Pointer is accepted only where C takes void*, so
	// each device pointer is converted to the exact pointer type.
	C.kernel_kValid(
		C.int(B),
		C.int(T),
		(*C.ktype)(unsafe.Pointer(storageDevice)),
		(*C.glob)(unsafe.Pointer(globDevice)),
	)
	cudaErr, err = C.cudaDeviceSynchronize()
	//rest of the code
}

对 CUDA 运行时 API 的所有调用都不会报错，出问题的只有我的内核包装函数。下面是我用 go build 构建 cuda 包时的输出：

C:\Users\user\Documents\Repos\go\cuda_wrapper>go build cuda_wrapper\cuda
# cuda_wrapper/cuda
In file included from C:/Storage/Cuda/include/host_defines.h:50:0,
                 from C:/Storage/Cuda/include/device_types.h:53,
                 from C:/Storage/Cuda/include/builtin_types.h:56,
                 from C:/Storage/Cuda/include/cuda_runtime.h:86,
                 from C:\Go\workspace\src\cuda_wrapper\cuda\cuda.go:12:
C:/Storage/Cuda/include/crt/host_defines.h:84:0: warning: "__cdecl" redefined
 #define __cdecl

<built-in>: note: this is the location of the previous definition
# cuda_wrapper/cuda
C:\Users\user\AppData\Local\Temp\go-build038297194\cuda_wrapper\cuda\_obj\cuda.cgo2.o: In function `_cgo_440ebb0a3e25_Cfunc_kernel_kValid':
/tmp/go-build\cuda_wrapper\cuda\_obj/cgo-gcc-prolog:306: undefined reference to `kernel_kValid'
collect2.exe: error: ld returned 1 exit status

到这里我就不确定问题出在哪里了。我查阅了很多关于 cgo 未定义引用的问题，但没有一个能解决我的问题。我也了解到 CUDA 运行时 API 是用 C++ 编写的，并研究过这是否会影响 cgo 的编译方式，但还没有得出任何结论。现在我越查越困惑，希望有更了解情况的人能给我指个方向。

Answer 1

又是老生常谈的名称修饰（name mangling）问题。

这是我们在 gorgonia 中使用的解决方案：

#include <math.h>

// Give the kernel C linkage when compiled as C++ so that its symbol name is
// not mangled.
#ifdef __cplusplus
extern "C" {
#endif


// sigmoid32 replaces A[idx] with 1 / (1 + e^(-A[idx])) for the single element
// whose flat index this thread computes; threads whose index is >= size do
// nothing.
__global__ void sigmoid32(float* A, int size)
{
    // Flatten the 3-D block index, then add the flattened 3-D thread index
    // within the block.
    int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;
    int idx = blockId * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
    if (idx >= size) {
        return;
    }
    // expf(-x) replaces powf((float)M_E, -x). It computes e^-x directly and
    // removes the dependency on M_E, which MSVC's <math.h> does not define
    // unless _USE_MATH_DEFINES is set. Results may differ from the powf form
    // in the last bits.
    A[idx] = 1.0f / (1.0f + expf(-A[idx]));
}

#ifdef __cplusplus
}
#endif

所以……只需把您的内核包装函数放在 extern "C" 块中即可。

CUDA内核包装器的共享库未定义引用

1 个答案: