使用 PyCUDA 进行二维矩阵乘法

时间:2013-01-12 16:44:28

标签: c++ python cuda python-2.7 pycuda

如何迭代两个数组?

/*
 * Computes, for each (row of x, row of y) pair, the dot product of the two
 * rows over their `ms` elements and stores it in `solution`.
 *
 * Both inputs are read as flat arrays with `ms` elements per row
 * (x[row * ms + i], y[row * ms + i]).
 *
 * Thread mapping: the y thread index selects the row of x, the x thread index
 * selects the row of y. The result for that pair is written to
 * solution[idy * dim_y + idx].
 *
 * NOTE(review): dim_x and dim_y are treated as the number of rows in x and y
 * respectively, so `solution` is expected to hold dim_x * dim_y floats. This
 * matches the sample in the question (5 rows in X, 2 rows in Y, 5x2 output)
 * but should be confirmed against the caller.
 */
__global__ void euclidean(float *x, float *y, int dim_x, int dim_y, int ms, float *solution) {

    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    int idy = threadIdx.y + blockDim.y * blockIdx.y;

    // The grid rarely divides the data evenly; surplus threads must not touch memory.
    if (idy >= dim_x || idx >= dim_y) {
        return;
    }

    float result = 0.0f;

    for (int iter = 0; iter < ms; iter++) {

        float x_e = x[idy * ms + iter];
        float y_e = y[idx * ms + iter];

        result += (x_e * y_e);
    }

    // The original kernel dropped the accumulated value; store it so the
    // computation is visible to the host.
    solution[idy * dim_y + idx] = result;
}

输入: X = [[1,2], [3,4], [5,6], [7,8], [9,10]],Y = [[0,0], [1,1]]

预期输出: [[0, 3], [0, 7], [0, 11], [0, 15], [0, 19]]

我该怎么做?我的困难在于如何遍历 X 和 Y。

预期:

  

[idx:0 idy:0 = 0] [idx:1 idy:0 = 3] [idx:2 idy:0 = 0] [idx:3   idy:0 = 7] [idx:4 idy:0 = 0] [idx:0 idy:1 = 11] [idx:1 idy:1 =   0] [idx:2 idy:1 = 15] [idx:3 idy:1 = 0] [idx:4 idy:1 = 19]

1 个答案:

答案 0 :(得分:2)

我会用下面的内核来计算两个矩阵的乘积。它处理了边界条件,因此应当适用于任意的网格/线程块大小。

// Compute C = A * B, one output element per thread.
//
// A, B and C are indexed as flat row-major arrays: A has numAColumns
// elements per row, B has numBColumns, C has numCColumns. The y thread
// index selects the output row and the x thread index the output column.
// Threads that fall outside the numCRows x numCColumns output do nothing.
// numARows and numBRows are accepted but not read by this kernel.
__global__ void matrixMultiply(float * A, float * B, float * C,
                   int numARows, int numAColumns,
                   int numBRows, int numBColumns,
                   int numCRows, int numCColumns) {
    const int outRow = threadIdx.y + blockDim.y * blockIdx.y;
    const int outCol = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard clause: surplus threads at the grid edge exit immediately.
    if (outRow >= numCRows || outCol >= numCColumns) {
        return;
    }

    // Start of this thread's row in A and top of its column in B.
    const float *rowOfA = A + outRow * numAColumns;
    const float *colOfB = B + outCol;

    float acc = 0;
    int k = 0;
    while (k < numAColumns) {
        // Step along the row of A and down the column of B.
        acc += rowOfA[k] * colOfB[k * numBColumns];
        ++k;
    }

    C[outRow * numCColumns + outCol] = acc;
}

如果您想要更高效的实现,还可以使用共享内存:

// Compute C = A * B using square tiles staged in shared memory.
//
// A, B and C are indexed as flat row-major arrays (numAColumns, numBColumns
// and numCColumns elements per row respectively). Each thread produces one
// element of C: the y block/thread index selects the row, the x block/thread
// index selects the column.
//
// Launch requirements implied by the indexing below:
//   - blockDim.x == blockDim.y == TILE_WIDTH_I, because every thread fills
//     exactly one slot of each TILE_WIDTH_I x TILE_WIDTH_I tile and the inner
//     product then reads the whole tile.
//   - Static shared memory: 2 * TILE_WIDTH_I * TILE_WIDTH_I floats per block.
//
// NOTE(review): the original mixed TILE_WIDTH and TILE_WIDTH_I for the same
// tile dimension. This version uses TILE_WIDTH_I (the size of the shared
// arrays) everywhere so indices can never exceed the tile; confirm both
// macros were meant to have the same value.
__global__ void matrixMultiplyShared(float * A, float * B, float * C,
                     int numARows, int numAColumns,
                     int numBRows, int numBColumns,
                     int numCRows, int numCColumns) {
    __shared__ float ds_A[TILE_WIDTH_I][TILE_WIDTH_I];
    __shared__ float ds_B[TILE_WIDTH_I][TILE_WIDTH_I];

    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int Row = by * TILE_WIDTH_I + ty;
    int Col = bx * TILE_WIDTH_I + tx;
    float cValue = 0.0f;

    // Ceil-division: a truncating division would skip the final partial tile
    // whenever numAColumns is not a multiple of the tile width, silently
    // dropping the last terms of every dot product.
    int numTiles = (numAColumns + TILE_WIDTH_I - 1) / TILE_WIDTH_I;

    for (int m = 0; m < numTiles; m++) {
        // Stage one tile of A; slots that fall outside A are zero-filled so
        // they contribute nothing to the sum.
        if (Row < numARows && m*TILE_WIDTH_I + tx < numAColumns) {
            ds_A[ty][tx] = A[Row*numAColumns + m*TILE_WIDTH_I + tx];
        } else {
            ds_A[ty][tx] = 0.0f;
        }

        // Stage the matching tile of B with the same zero padding.
        if (m*TILE_WIDTH_I + ty < numBRows && Col < numBColumns) {
            ds_B[ty][tx] = B[(m*TILE_WIDTH_I + ty)*numBColumns + Col];
        } else {
            ds_B[ty][tx] = 0.0f;
        }

        // Every thread must finish writing its slot before any thread reads the tiles.
        __syncthreads();

        if ((Row < numCRows) && (Col < numCColumns)) {
            for (int k = 0; k < TILE_WIDTH_I; k++) {
                cValue += ds_A[ty][k] * ds_B[k][tx];
            }
        }

        // Kept outside the branch above so all threads in the block reach it;
        // prevents the next iteration from overwriting tiles still being read.
        __syncthreads();
    }

    if ((Row < numCRows) && (Col < numCColumns)) {
        C[Row*numCColumns + Col] = cValue;
    }
}

相关问题