OpenCL - 平铺矩阵乘法 - 问题__local

时间:2015-07-10 05:03:50

标签: opencl matrix-multiplication

我已经完成了关于Cuda的课程,现在我试图通过将我的代码从类移植到openCL来学习一些openCL,但是我在OpenCL上使用本地内存时遇到了麻烦。下面的内核输出错误的结果,而具有相同主机代码的更简单的内核输出正确的答案。我曾尝试使用1D工作组等重写内核,但结果却相同。我正在从一个名为“matrixMultTiled.cl”的单独文件中加载内核。

内核代码:

#define tW 16

__kernel void matrixMult(int HeightA,int WidthA, int WidthB,
                         __global float *A, __global float *B,
                         __global float *C)
{                                          
  int t,k;
  float sum;
  int  tx = get_local_id(1),   ty = get_local_id(0);
  int row = get_global_id(1), col = get_global_id(0);

  __local float sA[tW][tW],sB[tW][tW];

  sum=0.;
  for(t=0; t < (WidthA-1)/tW+1; t+=1){
    if( (row<HeightA) && (t*tW+tx<WidthA) )
      sA[ty][tx] = A[row*WidthA+(t*tW+tx)];
    else
      sA[ty][tx] = 0.;

    if( (t*tW+ty<WidthA) && (col<WidthB) )
      sB[ty][tx] = B[(t*tW+ty)*WidthB+col];
    else
      sB[ty][tx] = 0.;

    barrier(CLK_LOCAL_MEM_FENCE); 

    for(k=0;k<tW;k+=1)
      sum += sA[ty][k]*sB[k][tx];

    barrier(CLK_LOCAL_MEM_FENCE); 
  }

  if((row<HeightA)&&(col<WidthB))
    C[row*WidthB+col] = sum;  
}

简单内核代码:

__kernel void matrixMult(int HeightA,int WidthA, int WidthtB,                   
                         __global float *A, __global float *B, 
                         __global float *C)             
{                                          
  int k;
  float sum;

  int row = get_global_id(1);              
  int col = get_global_id(0);              

  if((row<=HeightA)&&(col<=WidthtB)){
    sum=0.0f;
    for(k=0;k<WidthA;k+=1)
      sum += A[row*WidthA+k]*B[k*WidthtB+col];
    C[row*WidthtB+col] = sum;
  }
}

主机代码:

#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>
#include <CL/cl.h>

#define DEBUG_MODE false
#define clCheck(stmt) if((stmt)!= CL_SUCCESS){printf("%s\n",getErrorString(stmt));}
#define clExec(expr) {status=expr clCheck(status); if(status!=CL_SUCCESS){return status;}}
#define clGuard(expr) {expr clCheck(status); if(status!=CL_SUCCESS){return status;}}

const char *getErrorString(cl_int error);

char* read_file(const char *filename);

int platformInit(cl_uint *numPlatformsOut, cl_uint *numDevicesOut,
                 cl_platform_id **platformsOut, cl_device_id **devicesOut);

int contextQueueInit(cl_uint numDevices, cl_device_id *devices,int dev,
                     cl_context *contextOut, cl_command_queue *cmdQueueOut);

int initSingleKernelProgram(const char *progFile,cl_context context,
                            size_t numDev, cl_device_id* devices,
                            const char *kernelName,cl_program *programOut,
                            cl_kernel *kernelOut);

int main(){
  const char *kernelFile="matrixMultTiled.cl";
  int i,j,k,err,L,M,N;
  int WidthA=10,HeightA=10,WidthB=10;
  size_t offset=0,datasize, kerCount=1,*lenghts=NULL;
  float *hA,*hB,*hC;
  char *programSource;
  bool result=true;
  cl_int status;
  cl_bool blocking;
  cl_uint numPlat, numDev;
  cl_platform_id *platforms;
  cl_device_id *devices;
  cl_context context;
  cl_command_queue queue;
  cl_program program;
  cl_kernel kernel;
  cl_mem dA,dB,dC; 

  L=3;
  M=3;
  N=3;

  datasize = L*M*sizeof(float);
  hA = (float*)malloc(datasize);

  datasize = M*N*sizeof(float);
  hB = (float*)malloc(datasize);

  datasize = L*N*sizeof(float);
  hC = (float*)malloc(datasize);

  for(i=0;i<L;i+=1)
    for(j=0;j<M;j+=1)
      hA[i*M+j] = i+j;

  for(i=0;i<M;i+=1)
    for(j=0;j<N;j+=1)
      hB[i*M+j] = i*j;

  err=platformInit(&numPlat,&numDev,&platforms,&devices);
  if(err!=0){printf("problem in platform inicialization\n");return err;}

  err=contextQueueInit(numDev, devices,0,&context, &queue);
  if(err!=0){printf("problem in context and queue inicialization\n");return err;}

  err = initSingleKernelProgram(kernelFile,context,numDev,
                                devices,"matrixMult",&program,&kernel);
  if(err!=0){printf("problem with program or kernel init\n"); return err;}

  datasize = L*M*sizeof(float);
  clGuard(dA =clCreateBuffer(context,CL_MEM_READ_ONLY,datasize,NULL,&status);) 

  datasize = M*N*sizeof(float);
  clGuard(dB =clCreateBuffer(context,CL_MEM_READ_ONLY,datasize,NULL,&status);) 

  datasize = L*N*sizeof(float);
  clGuard(dC =clCreateBuffer(context,CL_MEM_WRITE_ONLY,datasize,NULL,&status);)

  offset=0;
  blocking = CL_TRUE;

  // last 3 arguments : 0 events in event_list, no wait_list, no event
  datasize = L*M*sizeof(float);
  clExec(clEnqueueWriteBuffer(queue,dA,blocking,offset,datasize,
                                    hA,0,NULL,NULL);); 

  // last 3 arguments : 0 events in event_list, no wait_list, no event
  datasize = M*N*sizeof(float);
  clExec(clEnqueueWriteBuffer(queue,dB,blocking,offset,datasize,
                                    hB,0,NULL,NULL);)

  status  = clSetKernelArg(kernel,0, sizeof(cl_int), &L);
  status |= clSetKernelArg(kernel,1, sizeof(cl_int), &M);
  status |= clSetKernelArg(kernel,2, sizeof(cl_int), &N);
  status |= clSetKernelArg(kernel,3, sizeof(cl_mem), &dA);
  status |= clSetKernelArg(kernel,4, sizeof(cl_mem), &dB);  
  status |= clSetKernelArg(kernel,5, sizeof(cl_mem), &dC); 
  clCheck(status); if(status!=CL_SUCCESS){
    printf("Problem Setting program Arguments\n");
    return status;
  }

  size_t localWS[2],globalWS[2];
  localWS[0] =16;  localWS[1]=16;
  globalWS[0] = ((L/16)+1)*16; globalWS[1]= ((N/16)+1)*16; 

  clExec(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, 
                                globalWS, localWS, 0, NULL, NULL);)

  blocking = CL_TRUE;
  datasize = L*N*sizeof(float);
  clGuard( clEnqueueReadBuffer(queue,dC,blocking,0,datasize,
                                     hC,0,NULL,NULL); )

  // check the result
  printf("result:\n");
  for(i=0;i<L;i+=1){
    for(j=0;j<N;j+=1)
      printf("%f ",hC[i*N+j]);
    printf("\n");
  }

  // Free OpenCL resources
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseCommandQueue(queue);
  clReleaseContext(context);
  clReleaseMemObject(dA);
  clReleaseMemObject(dB);
  clReleaseMemObject(dC);

  // Free host resources
  free(hA);
  free(hB);
  free(hC);
  free(platforms);
  free(devices);
  return 0;
}

const char *getErrorString(cl_int error){
  switch(error){
    // run-time and JIT compiler errors
    case 0: return "CL_SUCCESS";
    case -1: return "CL_DEVICE_NOT_FOUND";
    case -2: return "CL_DEVICE_NOT_AVAILABLE";
    case -3: return "CL_COMPILER_NOT_AVAILABLE";
    case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
    case -5: return "CL_OUT_OF_RESOURCES";
    case -6: return "CL_OUT_OF_HOST_MEMORY";
    case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
    case -8: return "CL_MEM_COPY_OVERLAP";
    case -9: return "CL_IMAGE_FORMAT_MISMATCH";
    case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
    case -11: return "CL_BUILD_PROGRAM_FAILURE";
    case -12: return "CL_MAP_FAILURE";
    case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
    case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
    case -15: return "CL_COMPILE_PROGRAM_FAILURE";
    case -16: return "CL_LINKER_NOT_AVAILABLE";
    case -17: return "CL_LINK_PROGRAM_FAILURE";
    case -18: return "CL_DEVICE_PARTITION_FAILED";
    case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";

    // compile-time errors
    case -30: return "CL_INVALID_VALUE";
    case -31: return "CL_INVALID_DEVICE_TYPE";
    case -32: return "CL_INVALID_PLATFORM";
    case -33: return "CL_INVALID_DEVICE";
    case -34: return "CL_INVALID_CONTEXT";
    case -35: return "CL_INVALID_QUEUE_PROPERTIES";
    case -36: return "CL_INVALID_COMMAND_QUEUE";
    case -37: return "CL_INVALID_HOST_PTR";
    case -38: return "CL_INVALID_MEM_OBJECT";
    case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
    case -40: return "CL_INVALID_IMAGE_SIZE";
    case -41: return "CL_INVALID_SAMPLER";
    case -42: return "CL_INVALID_BINARY";
    case -43: return "CL_INVALID_BUILD_OPTIONS";
    case -44: return "CL_INVALID_PROGRAM";
    case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
    case -46: return "CL_INVALID_KERNEL_NAME";
    case -47: return "CL_INVALID_KERNEL_DEFINITION";
    case -48: return "CL_INVALID_KERNEL";
    case -49: return "CL_INVALID_ARG_INDEX";
    case -50: return "CL_INVALID_ARG_VALUE";
    case -51: return "CL_INVALID_ARG_SIZE";
    case -52: return "CL_INVALID_KERNEL_ARGS";
    case -53: return "CL_INVALID_WORK_DIMENSION";
    case -54: return "CL_INVALID_WORK_GROUP_SIZE";
    case -55: return "CL_INVALID_WORK_ITEM_SIZE";
    case -56: return "CL_INVALID_GLOBAL_OFFSET";
    case -57: return "CL_INVALID_EVENT_WAIT_LIST";
    case -58: return "CL_INVALID_EVENT";
    case -59: return "CL_INVALID_OPERATION";
    case -60: return "CL_INVALID_GL_OBJECT";
    case -61: return "CL_INVALID_BUFFER_SIZE";
    case -62: return "CL_INVALID_MIP_LEVEL";
    case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
    case -64: return "CL_INVALID_PROPERTY";
    case -65: return "CL_INVALID_IMAGE_DESCRIPTOR";
    case -66: return "CL_INVALID_COMPILER_OPTIONS";
    case -67: return "CL_INVALID_LINKER_OPTIONS";
    case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";

    // extension errors
    case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
    case -1001: return "CL_PLATFORM_NOT_FOUND_KHR";
    case -1002: return "CL_INVALID_D3D10_DEVICE_KHR";
    case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR";
    case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR";
    case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR";
    default: return "Unknown OpenCL error";
  }
}

char* read_file(const char *filename){
  int err,offset=0;
  size_t size;
  FILE *file = fopen(filename,"r");
  if(file==NULL)
    return NULL;

  err=fseek(file,offset,SEEK_END);
  if(err<0){fclose(file); return NULL;}
  size = ftell(file); // anotating the end of File

  err=fseek(file,offset,SEEK_SET);
  if(err<0){fclose(file); return NULL;}

  char *content = (char*) malloc((size+1)*sizeof(char));
  if(err<0){fclose(file); return NULL;}

  err=fread(content,sizeof(char),size,file);
  if(err<0){fclose(file); free(content); return NULL;}  

  err=fclose(file);
  content[size]='\0';

  return content;
}
int platformInit(cl_uint *numPlatformsOut, cl_uint *numDevicesOut,
                 cl_platform_id **platformsOut, cl_device_id **devicesOut)
{
  cl_int status,i,j;
  cl_uint numPlatforms = 0, numDevices = 0;
  cl_platform_id *platforms = NULL;
  cl_device_id *devices = NULL;

  // first call -- get the number of platforms
  status = clGetPlatformIDs(0,NULL,&numPlatforms); 
  clCheck(status);
  if(status!=CL_SUCCESS){return (status);}

  platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
  if(platforms==NULL){return (-2001);}

  // second call -- get the platforms information  
  status = clGetPlatformIDs(numPlatforms,platforms,NULL);
  clCheck(status);
  if(status!=CL_SUCCESS){free(platforms); return (status);}

  if(DEBUG_MODE==true)
    printf("numPlatforms = %d\n",numPlatforms);

  for(i=0;i<numPlatforms;i+=1){
    char buf[1024+1];
    cl_uint dev_count;

    status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(buf),
                       buf, NULL);
    clCheck(status);
    if(status!=CL_SUCCESS){free(platforms); return (status);}

    if(DEBUG_MODE == true)
      printf("platform %d: vendor '%s'\n",i,buf);

    status = clGetDeviceIDs(platforms[i],CL_DEVICE_TYPE_ALL,0,
                          NULL,&dev_count); 
    clCheck(status);
    if(status!=CL_SUCCESS){free(platforms); return (status);}

    devices = (cl_device_id*)malloc(dev_count*sizeof(cl_device_id));
    if(devices==NULL){free(platforms); return (-2002);}

    // second call -- get the devices information
    status = clGetDeviceIDs(platforms[i],CL_DEVICE_TYPE_ALL,dev_count,
                          devices,NULL); 
    clCheck(status);
    if(status!=CL_SUCCESS){free(platforms); free(devices);return (status);}

    for(j=0;j<dev_count;j+=1){
      char bufDev[1024+1];

      status = clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(bufDev), 
                               bufDev, NULL);
      if(DEBUG_MODE==true)
        printf(" device %d: '%s'\n",j,bufDev);
    }

    if(i==0)
      numDevices=dev_count;
  }

  if(DEBUG_MODE==true)
    printf("%d %d %p %p\n",numPlatforms, numDevices,platforms,devices);

  *numPlatformsOut = numPlatforms;
  *numDevicesOut   = numDevices;
  *platformsOut    = platforms;
  *devicesOut      = devices;

  return 0;
}
int contextQueueInit(cl_uint numDevices, cl_device_id *devices,int dev,
                     cl_context *contextOut, cl_command_queue *cmdQueueOut)
{
  cl_int status;
  cl_context context = NULL;
  cl_command_queue cmdQueue;

  // create a context for given devices
  context = clCreateContext(NULL,numDevices,devices,NULL,NULL,&status);
  clCheck(status); 
  if(status!=CL_SUCCESS){
    if(context==NULL)
      return status;
    else{
      clReleaseContext(context);
      return status;
    }
  }

  // create a command queue in this context for device dev 
  cmdQueue = clCreateCommandQueue(context,devices[dev],0,&status);
  clCheck(status); 
  if(status!=CL_SUCCESS){
    if(context==NULL)
      return status;
    else{
      if(cmdQueue!=NULL){
        clReleaseCommandQueue(cmdQueue);
      }
      clReleaseContext(context);
      return status;
    }
  }

  *contextOut  = context;
  *cmdQueueOut = cmdQueue;

  return 0;
}

int initSingleKernelProgram(const char *progFile,cl_context context,
                            size_t numDev, cl_device_id* devices,
                            const char *kernelName,cl_program *programOut,
                            cl_kernel *kernelOut)
{
  cl_uint status;
  char *programSource=NULL;
  cl_program program=NULL;
  cl_kernel kernel=NULL;

  programSource = read_file(progFile);
  if(programSource==NULL)
    return -2001;

  if(DEBUG_MODE==true)
    printf("\n\n%s\n\n",programSource);

  program = clCreateProgramWithSource(context,1,(const char**)&programSource,
                                      NULL,&status);
  clCheck(status); 
  if(status!=CL_SUCCESS){
    free(programSource);
    return status;
  }

  status = clBuildProgram(program,numDev,devices,NULL,NULL,NULL);
  clCheck(status);
  if(status!=CL_SUCCESS){
    if (status == CL_BUILD_PROGRAM_FAILURE) {
      // Determine the size of the log
      size_t logSize;
      clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);

      // Allocate memory for the log
      char *logStr = (char *) malloc(logSize);

      // Get the log
      clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, logSize, logStr, NULL);

      // Print the log
      printf("%s\n", logStr);
    }

    clReleaseProgram(program);
    return status;
  }
  if(DEBUG_MODE==true){
    // Determine the size of the log
    size_t logSize;
    clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);

    // Allocate memory for the log
    char *logStr = (char *) malloc(logSize);

    // Get the log
    clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, logSize, logStr, NULL);

    // Print the log
    printf("%s\n", logStr);
  }

  kernel = clCreateKernel(program, kernelName,&status);
  clCheck(status);
  if(status!=CL_SUCCESS){
    clReleaseProgram(program);
    if(kernel!=NULL)
      clReleaseKernel(kernel);
    return status;
  }

  *programOut = program;
  *kernelOut = kernel;

  free(programSource);

  return 0;
}

任何亮点或建议都非常受欢迎

0 个答案:

没有答案