Matrix multiplication returns the wrong result (OpenCL)

Date: 2018-08-16 14:31:38

Tags: matrix opencl

The code below is supposed to perform matrix-matrix multiplication using OpenCL. When I execute it, the result matrix C is filled with zeros. This is wrong, because I am multiplying two diagonal matrices with non-zero diagonal entries.

/*
Taken from http://gpgpu-computing4.blogspot.com/2009/09/matrix-multiplication-2-opencl.html

We have made some modifications, but need to understand what is going on.
*/

// Multiply two matrices A * B = C

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <CL/cl.h>
#include "OpenCLUtils.h"

/* WA : Width of A, HA : Height of A, etc. */

#define WA 1024
#define HA 1024
#define WB 1024
#define LOCAL_WORK_SIZE 16
#define HB WA
#define WC WB
#define HC HA


void identityMultipliedByCoeff(float* data, int width, float coeff)
{
    for (int i = 0; i <width*width; ++i)
    {
        data[i] = 0.0;
    }
    for (int i = 0; i < width; ++i)
    {
        data[i*width + i] = 1.0*coeff;
    }
}

/////////////////////////////////////////////////////////
// Program main
/////////////////////////////////////////////////////////

int main(int argc, char** argv)
{

    // set seed for rand()
    srand(2006);

    // 1. allocate host memory for matrices A and B
    unsigned int size_A = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A = (float*)malloc(mem_size_A);

    unsigned int size_B = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B = (float*)malloc(mem_size_B);

    // 2. initialize host memory
    identityMultipliedByCoeff(h_A, WA, 12.0);
    identityMultipliedByCoeff(h_B, WB, -9.0);

    // 4. allocate host memory for the result C
    unsigned int size_C = WC * HC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* h_C = (float*)malloc(mem_size_C);

    // 5. Initialize OpenCL
    // OpenCL specific variables
    cl_device_id device;
    cl_context clGPUContext;
    cl_command_queue clCommandQue;
    cl_program clProgram;
    cl_kernel clKernel;

    size_t dataBytes;
    size_t kernelLength;
    cl_int errcode;

    // OpenCL device memory for matrices
    cl_mem d_A;
    cl_mem d_B;
    cl_mem d_C;

    /*****************************************/
    /* Initialize OpenCL */
    /*****************************************/
    /* Create a device and context */
    device = create_device();
    clGPUContext = clCreateContext(NULL, 1, &device, NULL, NULL, &errcode);
    if (errcode < 0) {
        perror("Couldn't create a context");
        exit(1);
    }

    // get the list of GPU devices associated 
    // with context
    errcode = clGetContextInfo(clGPUContext,
        CL_CONTEXT_DEVICES, 0, NULL,
        &dataBytes);
    cl_device_id *clDevices = (cl_device_id *)
        malloc(dataBytes);
    errcode |= clGetContextInfo(clGPUContext,
        CL_CONTEXT_DEVICES, dataBytes,
        clDevices, NULL);




    //Create a command-queue
    clCommandQue = clCreateCommandQueue(clGPUContext,clDevices[0], 0, &errcode);

    // Setup device memory
    /* We are passing the host memory as an argument. This is where
        the device memory obtains the data from the host memory. */
    d_C = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size_C, NULL, &errcode);
    d_A = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,mem_size_A, h_A, &errcode);
    d_B = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,mem_size_B, h_B, &errcode);

    // 6. Load and build OpenCL kernel

    // Open the .cl file and load it
    // into a char* buffer
    FILE* fp = fopen("MatMul.cl", "r");
    fseek(fp, 0, SEEK_END);
    const size_t lSize = ftell(fp);
    rewind(fp);
    unsigned char* buffer;
    buffer = (unsigned char*)malloc(lSize);
    fread(buffer, 1, lSize, fp);
    fclose(fp);

    cl_int status;
    clProgram = clCreateProgramWithBinary(clGPUContext,
        1, (const cl_device_id *)clDevices,
        &lSize, (const unsigned char**)&buffer,
        &status, &errcode);

    errcode = clBuildProgram(clProgram, 0, NULL, NULL,
        NULL, NULL);

    errcode = clBuildProgram(clProgram, 0,
        NULL, NULL, NULL, NULL);

    clKernel = clCreateKernel(clProgram,
        "matrixMul", &errcode);

    // 7. Launch OpenCL kernel
    size_t localWorkSize[2], globalWorkSize[2];

    int wA = WA;
    int wC = WC;

    /* Set the arguments for the kernel. */
    errcode = clSetKernelArg(clKernel, 0,sizeof(cl_mem), (void *)&d_C);
    errcode |= clSetKernelArg(clKernel, 1,sizeof(cl_mem), (void *)&d_A);
    errcode |= clSetKernelArg(clKernel, 2,sizeof(cl_mem), (void *)&d_B);
    errcode |= clSetKernelArg(clKernel, 3,sizeof(int), (void *)&wA);
    errcode |= clSetKernelArg(clKernel, 4,sizeof(int), (void *)&wC);

    /* Remember that in OpenCL we need to express the
        globalWorkSize in terms of the total number of work-items.
        The OpenCL runtime will look at the
        globalWorkSize and divide by the localWorkSize to
        arrive at a 64 by 64 grid of 16 by 16 work-groups. */

    localWorkSize[0] = LOCAL_WORK_SIZE;
    localWorkSize[1] = LOCAL_WORK_SIZE;
    globalWorkSize[0] = WA;
    globalWorkSize[1] = HA;

    errcode = clEnqueueNDRangeKernel(clCommandQue,clKernel, 2, NULL, globalWorkSize,localWorkSize, 0, NULL, NULL);


    //
    // The calculation has now been carried out
    //


    // 8. Retrieve result from device
    errcode = clEnqueueReadBuffer(clCommandQue,
        d_C, CL_TRUE, 0, mem_size_C,
        h_C, 0, NULL, NULL);

    // We must check the result

    for (int i = 0; i < WA; i++)
    {
        for (int j = 0; j < WA; j++)
        {
            float prod = 0;
            for (int k = 0; k < WA;k++)
            {
                prod += h_A[i*WA + k] * h_B[k*WA + j];
            }
            if (fabs(h_C[i*WA+j] - prod) > 0.01)
            {
                printf("The indices where the comparison failed, i = %d, j = %d\n", i,j);
                printf("C[i*WA+j] should equal %f\n", prod);
                printf("C[i*WA+j] = %f\n", h_C[i*WA + j]);
                perror("The matrix check has failed");
                exit(1);
                break;
            }

        }
    }
    printf("The matrix check has been successfull!\n");


    // 10. clean up memory
    free(h_A);
    free(h_B);
    free(h_C);

    clReleaseMemObject(d_A);
    clReleaseMemObject(d_C);
    clReleaseMemObject(d_B);

    free(clDevices);
    clReleaseContext(clGPUContext);
    clReleaseKernel(clKernel);
    clReleaseProgram(clProgram);
    clReleaseCommandQueue(clCommandQue);

}

The .cpp file for the header OpenCLUtils.h is given as:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <CL/cl.h>

cl_device_id create_device() {

    cl_platform_id* platforms;
    cl_uint num_platforms,num_devices;
    cl_device_id* devices;
    char platform_name_data[50];
    char name_data[50];
    cl_int i,err,platformchoice;

    platformchoice =1;

    /* Find out how many platforms there are */
    err = clGetPlatformIDs(1, NULL, &num_platforms);
    if (err < 0) {
        perror("Couldn't identify a platform");
        exit(1);
    }

    /* Reserve memory for platforms*/
    platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platforms);

    /* Obtain the available platforms and store them in the array platforms */
    clGetPlatformIDs(num_platforms, platforms, NULL);

    /* We want to know the names of the platforms.
    This will then inform us and lead to a
    canonical choice for 'platformchoice'. */

    for (i = 0; i < num_platforms; i++)
    {
        err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(platform_name_data), platform_name_data, NULL);
        if (err < 0)
        {
            perror("Unable to obtain information about platform");
        }
        printf("%s\n", platform_name_data);
    }

    printf("\nSearching %s for available devices...\n", platform_name_data);

    /* Obtain the number of GPUS available on this platform */
    err = clGetDeviceIDs(platforms[platformchoice], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
    if (err == CL_DEVICE_NOT_FOUND) 
    {
        perror("No GPU devices available");
        exit(1);
    }
    if (err < 0) {
        perror("Could not access any devices. Not as a result of the device not being found. Debug for error code");
        exit(1);
    }
    /* Reserve memory for devices */
    devices = (cl_device_id*)malloc(sizeof(cl_device_id)*num_devices);

    /* Populate devices with devices compatible with the chosen platform */
    clGetDeviceIDs(platforms[platformchoice], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL);

    for (i = 0; i < num_devices; i++)
    {
        err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(name_data), name_data, NULL);
        if (err < 0)
        {
            perror("Unable to obtain information about device");
        }
        printf("%s\n", name_data);
    }
    printf("\n");
    return devices[0];
}

I am using an AMD GPU (an R9 380). The "utility" code above lets me pick a platform and then a device that supports it. The result is the same whether I choose the Intel platform or the AMD platform.

The matrix multiplication kernel is the one given at http://gpgpu-computing4.blogspot.com/2009/10/matrix-multiplication-3-opencl.html

A screenshot from the Visual Studio debugger (not reproduced here) shows that status equals -42, which according to https://streamhpc.com/blog/2013-04-28/opencl-error-codes/ means CL_INVALID_BINARY.

EDIT:

I have tried

...
// 6. Load and build OpenCL kernel

// Obtain size of source file
FILE* fp = fopen("MatMul.cl", "r");
fseek(fp, 0, SEEK_END);
const size_t lSize = ftell(fp);
rewind(fp);

// Read file content into buffer
unsigned char* buffer = (unsigned char*)malloc(lSize+1);
buffer[lSize] = '\0';
fread(buffer, sizeof(char), lSize, fp);
fclose(fp);

//create program from buffer
clProgram = clCreateProgramWithSource(clGPUContext,1,(const char**)&buffer,&lSize, &errcode);

errcode = clBuildProgram(clProgram, 1,&device, NULL,NULL, NULL);

clKernel = clCreateKernel(clProgram,
    "matrixMul", &errcode);
...

but now I get error code -11, CL_BUILD_PROGRAM_FAILURE.
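
One way to see why the build fails is to query the compiler's build log with clGetProgramBuildInfo; a minimal sketch, reusing the clProgram, device and errcode variables from the code above:

if (errcode != CL_SUCCESS)
{
    // Ask how large the build log is, then fetch and print it.
    size_t logSize;
    clGetProgramBuildInfo(clProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    char* log = (char*)malloc(logSize + 1);
    clGetProgramBuildInfo(clProgram, device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
    log[logSize] = '\0';
    printf("Build log:\n%s\n", log);
    free(log);
}

Here the log should point at the problem in the kernel source (see the final edit below).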

FINAL EDIT:

I got it working.

Here is the corrected kernel:

/* Matrix multiplication: C = A * B.
 * Device code.
 */

// Thread block size
#define BLOCK_SIZE 16

//////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * B
//! wA is A's width and wB is B's width
//////////////////////////////////////////////////////
__kernel void
matrixMul(__global float* C, 
          __global float* A, 
          __global float* B, int wA, int wB)
{

    float Csub = 0;   // <<<<<<< THIS WAS MISSING IN THE BLOG

    // Block index
    int bx = get_group_id(0);
    int by = get_group_id(1);

    // Thread index
    int tx = get_local_id(0);
    int ty = get_local_id(1);

    // Index of the first sub-matrix of A processed 
    // by the block
    int aBegin = wA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed 
    // by the block
    int aEnd   = aBegin + wA - 1;

    // Step size used to iterate through the 
    // sub-matrices of A
    int aStep  = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed 
    // by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the 
    // sub-matrices of B
    int bStep  = BLOCK_SIZE * wB;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
             a <= aEnd;
             a += aStep, b += bStep) 
    {

        // Declaration of the local memory array As 
        // used to store the sub-matrix of A
        __local float As[BLOCK_SIZE][BLOCK_SIZE];

        // Declaration of the local memory array Bs 
        // used to store the sub-matrix of B
        __local float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from global memory
        // to local memory; each thread loads
        // one element of each matrix
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];

        // Synchronize to make sure the matrices 
        // are loaded
        barrier(CLK_LOCAL_MEM_FENCE);

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += As[ty][k] * Bs[k][tx];

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        barrier(CLK_LOCAL_MEM_FENCE);

    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;

}

1 Answer:

Answer 0 (score: 1):

I think the error is in this line:

 d_C = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size_A, NULL, &errcode);

The third argument should be mem_size_C.

EDIT: Also, for performance reasons, I would recommend copying matrices A and B with clEnqueueWriteBuffer. Further reading here.
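
For reference, a minimal sketch of that suggestion, using the variable names from the question (the CL_MEM_READ_ONLY flags here are an assumption; the essential points are passing mem_size_C for C and copying A and B explicitly):

d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, mem_size_C, NULL, &errcode);

// Allocate A and B without CL_MEM_COPY_HOST_PTR and copy the host data explicitly.
d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_ONLY, mem_size_A, NULL, &errcode);
d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_ONLY, mem_size_B, NULL, &errcode);
errcode  = clEnqueueWriteBuffer(clCommandQue, d_A, CL_TRUE, 0, mem_size_A, h_A, 0, NULL, NULL);
errcode |= clEnqueueWriteBuffer(clCommandQue, d_B, CL_TRUE, 0, mem_size_B, h_B, 0, NULL, NULL);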