Question

我是OpenCL的初学者。我试图编写一个简单的应用程序，只是将两个向量相加并得到结果。以下是我的主机代码：

#define USE_PLATFORM 0
#define USE_DEVICE 2
#define DATA_SIZE 1024

#define USE_KERNEL_PATH "/Users/huangxin/Documents/August13Programming/FirstEGOpenCL/FirstEGOpenCL/kernel.cl"

using namespace std;

int main(int argc, const char * argv[]) {
    int err;
    cl_uint numPlatforms;
    cl_uint numDevices;
    cl_command_queue command;
    size_t global;

    //Query the number of platforms supported.
    err = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (err != CL_SUCCESS || USE_PLATFORM >= numPlatforms)
    {
        printf("Error at: clGetPlatformIDs(querying platforms count failed):\n");
        exit(-1);
    }

    //Get all platforms.
    vector<cl_platform_id> platforms(numPlatforms);
    err = clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms);
    if (err != CL_SUCCESS)
    {
        printf("Error at: clGetPlatformIDs(getting all platforms failed):\n");
        exit(-1);
    }

    //Query the number of devices supported by the platform spicified.
    err = clGetDeviceIDs(platforms[USE_PLATFORM], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
    if (err != CL_SUCCESS || USE_PLATFORM >= numDevices)
    {
        printf("Error at: clGetDeviceIDs(querying devices count failed):\n");
        exit(-1);
    }

    //Get all devices.
    vector<cl_device_id> devices(numDevices);
    err=clGetDeviceIDs(platforms[USE_PLATFORM], CL_DEVICE_TYPE_ALL, numDevices, &devices[0], &numDevices);
    if (err != CL_SUCCESS)
    {
        printf("Error at: clGetDeviceIDs(getting all devices failed):\n");
        exit(-1);
    }


    //Get device infomation.
    char deviceInfo[1024];
    //get device max work item dimensions.
    size_t maxItemSize[3];
    clGetDeviceInfo(devices[USE_DEVICE], CL_DEVICE_NAME, sizeof(deviceInfo)*1024, deviceInfo, NULL);
    clGetDeviceInfo(devices[USE_DEVICE], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, maxItemSize, NULL);
    cout << "Device selected: " << deviceInfo << endl;
    cout << "Max item size: " << maxItemSize[0] << "," << maxItemSize[1] << ","<< maxItemSize[2] << endl;

    //Set property with certain platform
    cl_context_properties prop[] = {CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platforms[USE_PLATFORM]), 0};

    //create context with certain property.
    cl_context context = clCreateContextFromType(prop, CL_DEVICE_TYPE_ALL, NULL, NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error at: clCreateContextFromType(get context failed):\n");
        exit(-1);
    }

    //create command queue using selected device and context.
    command = clCreateCommandQueue(context, devices[USE_DEVICE], 0, NULL);

    //create program with specified kernel source.
    const char *kernelSource = getKernelSource(USE_KERNEL_PATH);
    cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, 0, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error at: clCreateProgramWithSource(get program failed):\n");
        exit(-1);
    }

    //since OpenCL is a dynamic-compile architechture, we need to build the program.
    err = clBuildProgram(program, 0, 0, 0, 0, 0);
    if (err != CL_SUCCESS)
    {
        cout << err << endl;
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, devices[USE_DEVICE], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }

    //kernel是OpenCL中对执行在一个最小粒度的compute item上的代码及参数的抽象
    //create the kernel function using the built program.
    cl_kernel adder = clCreateKernel(program, "adder", &err);
    if (err != CL_SUCCESS)
    {
        printf("Error at: clCreateKernel(get kernel function failed):\n");
        exit(-1);
    }

    //create the vector of input random data.
    vector<float> inA(DATA_SIZE), inB(DATA_SIZE);
    for(int i = 0; i < DATA_SIZE; i++) {
        inA[i] = (float)(random() % DATA_SIZE) / 1000;
        inB[i] = (float)(random() % DATA_SIZE) / 1000;
    }

    //create the read-only device mem using specified context, that is to copy the host mem to the device mem.
    cl_mem cl_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * DATA_SIZE, &inA[0], NULL);
    cl_mem cl_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * DATA_SIZE, &inB[0], NULL);
    //create the result mem.
    cl_mem cl_res = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * DATA_SIZE, NULL, NULL);

    //setting up the arguement of kernel memory
    clSetKernelArg(adder, 0, sizeof(cl_mem), &cl_a);
    clSetKernelArg(adder, 1, sizeof(cl_mem), &cl_b);
    clSetKernelArg(adder, 2, sizeof(cl_mem), &cl_res);

    START_CHECK_RUNNING_TIME
    //enqueue the kernel into the specified command(#TODO:come back later to check the remaining arguement.
    global = DATA_SIZE;
    err = clEnqueueNDRangeKernel(command, adder, 1, 0, &global, 0, 0, 0, 0);
    if (err != CL_SUCCESS)
    {
        printf("Error at: clEnqueueNDRangeKernel(enqueue kernel failed):\n");
        exit(-1);
    }

    printf("*****************FLAG***************");

    //copy the results from the kernel into the host(CPU).
    vector<float> res(DATA_SIZE);
     err = clEnqueueReadBuffer(command, cl_res, CL_TRUE, 0, sizeof(float) * DATA_SIZE, &res[0], 0, 0, 0);
    END_CHECK_RUNNING_TIME

    //check the number of right compute.
    int cnt = 0;
    for (int i = 0; i < res.size(); i++) {
        cnt += (res[i] == inA[i] + inB[i] ? 1 : 0);
    }
    cout << "Computed " << res.size() << " values\n";
    cout << "Correct values:(" << cnt << "/" << res.size() << "),correct rate:" << (float)cnt / res.size() * 100 << "%" << endl;

    gettimeofday(&sTime, NULL);
    for (int i = 0; i < res.size(); i++) {
        for (int j = 0; j < 10000; j++)
            res[i] = inA[i] + inB[i];
    }
    gettimeofday(&eTime, NULL);timeuse = 1000000 * ( eTime.tv_sec - sTime.tv_sec ) + eTime.tv_usec -sTime.tv_usec; printf("Running time: %fs\n", (double)timeuse/(1000000));

    //cleaning up the variables.
    clReleaseKernel(adder);
    clReleaseProgram(program);
    clReleaseMemObject(cl_a);
    clReleaseMemObject(cl_b);
    clReleaseMemObject(cl_res);
    clReleaseCommandQueue(command);
    clReleaseContext(context);
    return 0;
}

这段代码有点长，但其实很简单。以下是我的内核代码：

// Element-wise vector addition: each work-item writes a[idx] + b[idx] to
// result[idx], where idx is its global id in dimension 0.
//
// There is no bounds check, so the global work size must not exceed the
// length of the three buffers.
//
// The double-underscore qualifier spellings (__kernel / __global) are used;
// the specification accepts both forms, and this is the conventional one.
__kernel void adder(__global const float* a, __global const float* b, __global float* result)
{
    const size_t idx = get_global_id(0);
    // The same sum is stored 10000 times; presumably an artificial workload
    // for timing - the final value equals that of a single addition.
    for (int i = 0; i < 10000; i++)
        result[idx] = a[idx] + b[idx];
}

我得到了以下结果：

Device selected: GeForce GT 650M
-11
Error: Failed to build program executable!
No kernels or only kernel prototypes found.

我不太明白“没有内核或仅发现内核原型”是什么意思。我的意思是，如果我使用第一个设备（CPU）或第二个设备（HD Graphics 4000），同样的代码运行得非常完美。

我想知道出了什么问题以及为什么会这样。

我在使用Mac OS X 10.10的Xcode中运行这些代码。

Answer 1

正如评论所说，使用以下写法是一种很好的做法：

__kernel void adder(__global const float* a, __global const float* b, __global float* result)

因为这样可以清楚地标明这些是CL特有的限定符。通常所有CL内核都遵循这一惯例，尽管规范同时允许两种写法。

但您的问题很可能是因为调用clBuildProgram()时设备列表传入了空值，而没有指定任何设备。因此，根本没有编译任何东西！

在CL中，每个设备都有一个特定的编译器（CPU没有与GPU相同的编译器，有时甚至没有相同的指令集）。因此，您应该为API提供必须为其编译内核的设备列表。

正确的方法是：

err = clBuildProgram(program, 1, &devices[USE_DEVICE], "", 0, 0);

注意：我添加了“”，因为将来你可能想要添加一些构建参数，最好准备好它：）

clBuildProgram失败并显示错误：无法构建程序可执行文件

1 个答案: