Nvidia GEForce上的OpenCL示例程序问题

时间:2014-06-04 15:19:12

标签: parallel-processing opencl gpu nvidia

我是OpenCL的新手,并尝试从其中一个教程中运行矩阵向量乘法的示例代码。有两个文件,一个是启动内核的matvec.c,另一个是保存内核函数的matvec.cl。 该计划如下:

#define PROGRAM_FILE "matvec.cl"
#define KERNEL_FUNC "matvec_mult"
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
int main() {
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_int i, err;
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
cl_kernel kernel;
size_t work_units_per_kernel;
float mat[16], vec[4], result[4];
float correct[4] = {0.0f, 0.0f, 0.0f, 0.0f};
cl_mem mat_buff, vec_buff, res_buff;
for(i=0; i<16; i++) {
mat[i] = i * 2.0f;
            }
for(i=0; i<4; i++) {
vec[i] = i * 3.0f;
    correct[0] += mat[i] * vec[i];
    correct[1] += mat[i+4] * vec[i];
    correct[2] += mat[i+8] * vec[i];
    correct[3] += mat[i+12] * vec[i];
}
clGetPlatformIDs(1, &platform, NULL);
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1,
&device, NULL);
context = clCreateContext(NULL, 1, &device, NULL,
NULL, &err);
program_handle = fopen(PROGRAM_FILE, "r");
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size,
program_handle);
fclose(program_handle);
program = clCreateProgramWithSource(context, 1,
(const char**)&program_buffer, &program_size, &err);
free(program_buffer);
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
kernel = clCreateKernel(program, KERNEL_FUNC, &err);
queue = clCreateCommandQueue(context, device, 0, &err);
mat_buff = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(float)*16, mat, &err);
vec_buff = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(float)*4, vec, &err);
res_buff = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(float)*4, NULL, &err);
clSetKernelArg(kernel, 0, sizeof(cl_mem), &mat_buff);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &vec_buff);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &res_buff);
work_units_per_kernel = 4;
clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
&work_units_per_kernel, NULL, 0, NULL, NULL);
clEnqueueReadBuffer(queue, res_buff, CL_TRUE, 0,
sizeof(float)*4, result, 0, NULL, NULL);
if((result[0] == correct[0]) && (result[1] == correct[1])
&& (result[2] == correct[2]) && (result[3] == correct[3])) {
printf("Matrix-vector multiplication successful.\n");
}
else {
printf("Matrix-vector multiplication unsuccessful.\n");
}
clReleaseMemObject(mat_buff);
clReleaseMemObject(vec_buff);
clReleaseMemObject(res_buff);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
system("pause");
return 0;
}

内核函数如下:

#include <CL\cl.h>

__kernel void matvec_mult(__global float4* matrix, __global float4* vector,__global   
float* result) {                       
int i = get_global_id(0);
result[i] = dot(matrix[i], vector[0]);
}

当我运行此示例代码时,它显示&#34; Matrix-vector multiplication notccessful&#34;。我正在使用Visual Studio 2010并尝试调试代码。由简单的for循环生成的正确变量的值&#39;与内核函数生成的结果变量不匹配。有人可以帮我解决这个问题吗?指出我出错的地方,我无法确认是代码问题还是系统问题。提前感谢您的帮助。

其中一个错误:&#34;智能感知:标识符&#34; get_global_id&#34;未定义&#34;。

1 个答案:

答案 0 :(得分:2)

错误代码在<CL/cl.h>

中定义

错误-45是CL_INVALID_PROGRAM_EXECUTABLE。根据{{​​3}},它意味着&#34;没有为程序&#34;成功构建可执行文件。在内核源的第一个原始数据中不必要包含。删除它:

#include <CL\cl.h>

OpenCL C不允许包含常规的C / C ++标头。只能包含符合OpenCL C标准的源文件。

通常,为了看看,OpenCL编译器在构建内核时会产生什么错误,包括这样的源代码:(我从现有代码中获取了代码片段,因此检查变量名称等) < / p>

cl_int ret;

program = clCreateProgramWithSource(
    context, 1, (const char**)&src_file, NULL, &ret);

if(ret != CL_SUCCESS){
    fprintf(stderr, "Error with code %d happened.\n", ret);
}

// Warnings will be treated like errors, this is useful for debug
char build_params[] = {"-Werror"};    
ret = clBuildProgram(program, 0, NULL, build_params, NULL, NULL);

if (ret != CL_SUCCESS)
{
    size_t len = 0;
    char *buffer;

    clGetProgramBuildInfo(program,
        device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);

    buffer = calloc(len, sizeof(char));

    clGetProgramBuildInfo(program,
        device_id, CL_PROGRAM_BUILD_LOG, len, buffer, NULL);

    fprintf(stderr, "%s\n", buffer);

    free(buffer);
}