在这个作业中,我需要完成代码,使用CUDA C将两个矩形矩阵相乘。完成代码后,我提交了解决方案,当矩阵为正方形时,解决方案对于数据集是正确的,而结果不是当矩阵不是正方形时,匹配期望值。
这是我添加缺失部分后的代码:
#include <wb.h>
#define wbCheck(stmt) do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
wbLog(ERROR, "Failed to run stmt ", #stmt); \
return -1; \
} \
} while(0)
// Compute C = A * B
__global__ void matrixMultiply(float * A, float * B, float * C,
int numARows, int numAColumns,
int numBRows, int numBColumns,
int numCRows, int numCColumns) {
//@@ Insert code to implement matrix multiplication here
int Row = blockIdx.y * blockDim.y + threadIdx.y;
int Col = blockIdx.x * blockDim.x + threadIdx.x;
if (numAColumns != numBRows) return ;
if ((Row < numARows) && (Col < numBColumns)){
float Cvalue = 0;
for (int k = 0 ; k < numAColumns ; ++k )
Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col];
C[Row*numAColumns + Col] = Cvalue;
}
}
int main(int argc, char ** argv) {
wbArg_t args;
float * hostA; // The A matrix
float * hostB; // The B matrix
float * hostC; // The output C matrix
float * deviceA;
float * deviceB;
float * deviceC;
int numARows; // number of rows in the matrix A
int numAColumns; // number of columns in the matrix A
int numBRows; // number of rows in the matrix B
int numBColumns; // number of columns in the matrix B
int numCRows; // number of rows in the matrix C (you have to set this)
int numCColumns; // number of columns in the matrix C (you have to set this)
args = wbArg_read(argc, argv);
wbTime_start(Generic, "Importing data and creating memory on host");
hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
//@@ Set numCRows and numCColumns
numCRows = 0;
numCColumns = 0;
numCRows = numARows;
numCColumns = numBColumns;
//@@ Allocate the hostC matrix
hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns);
wbTime_stop(Generic, "Importing data and creating memory on host");
wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);
wbTime_start(GPU, "Allocating GPU memory.");
//@@ Allocate GPU memory here
cudaMalloc((void**)&deviceA ,sizeof(float)*numARows*numAColumns );
cudaMalloc((void**)&deviceB , sizeof(float)*numBRows*numBColumns);
cudaMalloc((void**)&deviceC , sizeof(float)*numCRows*numCColumns);
wbTime_stop(GPU, "Allocating GPU memory.");
wbTime_start(GPU, "Copying input memory to the GPU.");
//@@ Copy memory to the GPU here
cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns, cudaMemcpyHostToDevice);
cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice);
wbTime_stop(GPU, "Copying input memory to the GPU.");
//@@ Initialize the grid and block dimensions here
dim3 DimGrid(numARows / 8 , numBColumns / 8, 1);
dim3 DimBlock(8 , 8, 1);
wbTime_start(Compute, "Performing CUDA computation");
//@@ Launch the GPU Kernel here
matrixMultiply<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns);
cudaThreadSynchronize();
wbTime_stop(Compute, "Performing CUDA computation");
wbTime_start(Copy, "Copying output memory to the CPU");
//@@ Copy the GPU memory back to the CPU here
cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns , cudaMemcpyDeviceToHost);
wbTime_stop(Copy, "Copying output memory to the CPU");
wbTime_start(GPU, "Freeing GPU Memory");
//@@ Free the GPU memory here
cudaFree(deviceA);
cudaFree(deviceB);
cudaFree(deviceC);
wbTime_stop(GPU, "Freeing GPU Memory");
wbSolution(args, hostC, numCRows, numCColumns);
free(hostA);
free(hostB);
free(hostC);
return 0;
}
我希望你能帮助我找出哪个部分不正确。
答案 0 :(得分:4)
在Ira,Ahmad,ram和Oli Fly的帮助下,我得到了正确答案如下:
#include <wb.h>
#define wbCheck(stmt) do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
wbLog(ERROR, "Failed to run stmt ", #stmt); \
return -1; \
} \
} while(0)
// Compute C = A * B
__global__ void matrixMultiply(float * A, float * B, float * C,
int numARows, int numAColumns,
int numBRows, int numBColumns,
int numCRows, int numCColumns) {
//@@ Insert code to implement matrix multiplication here
int Row = blockIdx.y * blockDim.y + threadIdx.y;
int Col = blockIdx.x * blockDim.x + threadIdx.x;
if (numAColumns != numBRows) return;
if ((Row < numARows) && (Col < numBColumns)){
float Cvalue = 0;
for (int k = 0; k < numAColumns; ++k)
Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col];
C[Row*numCColumns + Col] = Cvalue;
}
}
int main(int argc, char ** argv) {
wbArg_t args;
float * hostA; // The A matrix
float * hostB; // The B matrix
float * hostC; // The output C matrix
float * deviceA;
float * deviceB;
float * deviceC;
int numARows; // number of rows in the matrix A
int numAColumns; // number of columns in the matrix A
int numBRows; // number of rows in the matrix B
int numBColumns; // number of columns in the matrix B
int numCRows; // number of rows in the matrix C (you have to set this)
int numCColumns; // number of columns in the matrix C (you have to set this)
args = wbArg_read(argc, argv);
wbTime_start(Generic, "Importing data and creating memory on host");
hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
//@@ Set numCRows and numCColumns
numCRows = 0;
numCColumns = 0;
numCRows = numARows;
numCColumns = numBColumns;
//@@ Allocate the hostC matrix
hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns);
wbTime_stop(Generic, "Importing data and creating memory on host");
wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);
wbTime_start(GPU, "Allocating GPU memory.");
//@@ Allocate GPU memory here
cudaMalloc((void**)&deviceA ,sizeof(float)*numARows*numAColumns );
cudaMalloc((void**)&deviceB , sizeof(float)*numBRows*numBColumns);
cudaMalloc((void**)&deviceC , sizeof(float)*numCRows*numCColumns);
wbTime_stop(GPU, "Allocating GPU memory.");
wbTime_start(GPU, "Copying input memory to the GPU.");
//@@ Copy memory to the GPU here
cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns, cudaMemcpyHostToDevice);
cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice);
wbTime_stop(GPU, "Copying input memory to the GPU.");
//@@ Initialize the grid and block dimensions here
dim3 DimGrid((numCColumns - 1) / 8 + 1, (numCRows - 1) / 8 + 1, 1);
dim3 DimBlock(8 , 8, 1);
wbTime_start(Compute, "Performing CUDA computation");
//@@ Launch the GPU Kernel here
matrixMultiply<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns);
cudaThreadSynchronize();
wbTime_stop(Compute, "Performing CUDA computation");
wbTime_start(Copy, "Copying output memory to the CPU");
//@@ Copy the GPU memory back to the CPU here
cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns , cudaMemcpyDeviceToHost);
wbTime_stop(Copy, "Copying output memory to the CPU");
wbTime_start(GPU, "Freeing GPU Memory");
//@@ Free the GPU memory here
cudaFree(deviceA);
cudaFree(deviceB);
cudaFree(deviceC);
wbTime_stop(GPU, "Freeing GPU Memory");
wbSolution(args, hostC, numCRows, numCColumns);
free(hostA);
free(hostB);
free(hostC);
return 0;
}
答案 1 :(得分:3)
替换:
for (int k = 0 ; k < numAColumns ; ++k )
Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col];
C[Row*numAColumns + Col] = Cvalue;
}
与
for (int k = 0 ; k < numAColumns ; ++k )
Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col];
C[Row*numCColumns + Col] = Cvalue;
}
答案 2 :(得分:2)
替换:
Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col];
与
Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col];
答案 3 :(得分:2)
替换
Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col];
代表
Cvalue += A[Row*numAColumns +k]* B[k*numBColumns+Col];
和
C[Row*numAColumns + Col] = Cvalue;
的
C[Row*numCColumns+Col] = Cvalue;
答案 4 :(得分:1)
我们可以使用平铺矩阵乘法,我发现它有更好的执行时间。
#include <wb.h>
#define wbCheck(stmt) do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
wbLog(ERROR, "Failed to run stmt ", #stmt); \
return -1; \
} \
} while(0)
// Compute C = A * B
__global__ void matrixMultiplyShared(float * A, float * B, float * C,
int numARows, int numAColumns,
int numBRows, int numBColumns,
int numCRows, int numCColumns) {
//@@ Insert code to implement matrix multiplication here
//@@ You have to use shared memory for this MP
const int TILE_WIDTH = 32;
__shared__ float sharedA[TILE_WIDTH][TILE_WIDTH];
__shared__ float sharedB[TILE_WIDTH][TILE_WIDTH];
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int Row = by*TILE_WIDTH + ty;
int Col = bx*TILE_WIDTH + tx;
float Cvalue = 0.0;
if (numAColumns != numBRows) return ;
for (int i = 0; i < (int)(ceil((float)numAColumns/TILE_WIDTH)); i++)
{
if (i*TILE_WIDTH + tx < numAColumns && Row < numARows){
sharedA[ty][tx] = A[Row*numAColumns + i*TILE_WIDTH + tx];
}else{
sharedA[ty][tx] = 0.0;
}
if (i*TILE_WIDTH + ty < numBRows && Col < numBColumns){
sharedB[ty][tx] = B[(i*TILE_WIDTH + ty)*numBColumns + Col];
}else{
sharedB[ty][tx] = 0.0;
}
__syncthreads();
if(Row < numARows && Col < numBColumns){
for(int j = 0; j < TILE_WIDTH; j++)
Cvalue += sharedA[ty][j] * sharedB[j][tx];
}
__syncthreads();
}
if (Row < numCRows && Col < numCColumns)
C[Row*numCColumns + Col] = Cvalue;
}
int main(int argc, char ** argv) {
wbArg_t args;
float * hostA; // The A matrix
float * hostB; // The B matrix
float * hostC; // The output C matrix
float * deviceA;
float * deviceB;
float * deviceC;
int numARows; // number of rows in the matrix A
int numAColumns; // number of columns in the matrix A
int numBRows; // number of rows in the matrix B
int numBColumns; // number of columns in the matrix B
int numCRows; // number of rows in the matrix C (you have to set this)
int numCColumns; // number of columns in the matrix C (you have to set this)
int TILE_WIDTH = 32;
args = wbArg_read(argc, argv);
wbTime_start(Generic, "Importing data and creating memory on host");
hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
//@@ Set numCRows and numCColumns
numCRows = 0;
numCColumns = 0;
numCRows = numARows;
numCColumns = numBColumns;
//@@ Allocate the hostC matrix
hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns);
wbTime_stop(Generic, "Importing data and creating memory on host");
wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);
wbTime_start(GPU, "Allocating GPU memory.");
//@@ Allocate GPU memory here
cudaMalloc((void**)&deviceA , sizeof(float)*numARows*numAColumns );
cudaMalloc((void**)&deviceB , sizeof(float)*numBRows*numBColumns);
cudaMalloc((void**)&deviceC , sizeof(float)*numCRows*numCColumns);
wbTime_stop(GPU, "Allocating GPU memory.");
wbTime_start(GPU, "Copying input memory to the GPU.");
//@@ Copy memory to the GPU here
cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns, cudaMemcpyHostToDevice);
cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice);
wbTime_stop(GPU, "Copying input memory to the GPU.");
//@@ Initialize the grid and block dimensions here
int dimX = (int)(ceil((float)numCColumns / TILE_WIDTH));
int dimY = (int)(ceil((float)numCRows / TILE_WIDTH));
dim3 DimGrid(dimX, dimY);
dim3 DimBlock(TILE_WIDTH, TILE_WIDTH);
wbTime_start(Compute, "Performing CUDA computation");
//@@ Launch the GPU Kernel here
matrixMultiplyShared<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns);
cudaThreadSynchronize();
wbTime_stop(Compute, "Performing CUDA computation");
wbTime_start(Copy, "Copying output memory to the CPU");
//@@ Copy the GPU memory back to the CPU here
cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns , cudaMemcpyDeviceToHost);
wbTime_stop(Copy, "Copying output memory to the CPU");
wbTime_start(GPU, "Freeing GPU Memory");
//@@ Free the GPU memory here
cudaFree(deviceA);
cudaFree(deviceB);
cudaFree(deviceC);
wbTime_stop(GPU, "Freeing GPU Memory");
wbSolution(args, hostC, numCRows, numCColumns);
free(hostA);
free(hostB);
free(hostC);
return 0;
}