Question

当我尝试将 cl_float2 值的数组复制到常量内存时，在 Nvidia 平台上无法正常工作：.y 分量似乎始终为零。而在 AMD 和 Intel 平台上没有这个问题。

// Host
c_Quadrature_Filter_1 = clCreateBuffer(context, CL_MEM_READ_ONLY, FILTER_SIZE * FILTER_SIZE * sizeof(cl_float2), NULL, &createBufferErrorQuadratureFilter1);

cl_float2* filter_temp = (cl_float2*)malloc(FILTER_SIZE * FILTER_SIZE * sizeof(cl_float2));
cl_float2 test;
test.s[0] = 3.0f;
test.s[1] = 13.0f;

for (int xx = 0; xx < FILTER_SIZE; xx++)
{
    for (int yy = 0; yy < FILTER_SIZE; yy++)
    {
        filter_temp[xx + yy * FILTER_SIZE].s[0] = test.s[0];
        filter_temp[xx + yy * FILTER_SIZE].s[1] = test.s[1];                        
    }
}

clEnqueueWriteBuffer(commandQueue, c_Quadrature_Filter_1, CL_TRUE, 0, FILTER_SIZE * FILTER_SIZE * sizeof(cl_float2), filter_temp, 0, NULL, NULL);
free(filter_temp);

//Device
// Each work-item copies the .y component of the first element of the
// __constant filter array into the .y component of one element of
// Filter_Response.
//
// NOTE(review): as posted, the declaration has no return type and no kernel
// name after __kernel; OpenCL C expects `__kernel void <name>(...)`.
// Confirm against the real source file.
//
// Filter_Response        - __global output array of float2; only .y is written here.
// c_Quadrature_Filter_1  - __constant array of float2; only element 0 is read here.
// DATA_W, DATA_H         - passed through to Calculate3DIndex.
// DATA_D                 - not used in this body.
__kernel(__global float2* Filter_Response, __constant float2* c_Quadrature_Filter_1, __private int DATA_W, __private int DATA_H, __private int DATA_D)
{
    // Global work-item IDs in the three NDRange dimensions.
    int x = get_global_id(0);
    int y = get_global_id(1);
    int z = get_global_id(2);

    // Calculate3DIndex is defined elsewhere; presumably it linearizes (x, y, z)
    // using the width and height - TODO confirm. No bounds check is done here.
    Filter_Response[Calculate3DIndex(x,y,z,DATA_W,DATA_H)].y = c_Quadrature_Filter_1[0].y;
}

Answer 1

我可能错了，但我从未能让矢量类型运算在 GTX680 和 GTX260 上正常工作。因此，像 FFT 这样的一些 AMD 库无法在 NVIDIA 显卡上使用，但在 AMD 和 Intel 硬件上运行良好。NVIDIA 似乎在 OpenCL 方面落后了。

另一个需要注意的是 OpenCL 设备的最佳矢量宽度，以便最好地利用资源。例如，我的 ATI 7990 的 float 首选矢量宽度为 1，而我的 i5 的 float 首选矢量宽度为 8。因此，为了充分利用 i5，我会使用 float8 来最大限度地利用 SIMD。

要检查首选矢量长度，请使用clGetDeviceInfo和CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT选项。

编辑：

糟糕，我展示的是原生（native）矢量宽度，不过它与首选矢量宽度相同。

enter image description here

将 cl_float2 复制到常量内存在 Nvidia 平台上不起作用

1 个答案: