
时间:2015-04-11 15:57:09

标签: c++ c matrix cuda


假设我有一个 MxN 的矩阵和一个长度为 N 的向量。我想把这个向量加到矩阵的每一行上(也可以换成任何其他数学运算)。这种操作的串行代码是:

// Serial reference: add the vector V (one entry per column) to every row of
// the matrix M, which is stored as a flat row-major array of rows x columns
// elements.
for (int r = 0; r < rows; r++)
    for (int c = 0; c < columns; c++)
        // Row-major layout: consecutive rows are `columns` elements apart, so
        // element (r, c) is at r * columns + c (not r * rows + c, which is only
        // correct for a square matrix).
        M[r * columns + c] += V[c];


// Adds entries of `vector` to a flat float matrix in place, one thread per
// matrix element.
//
// size   - total number of matrix elements; threads with idx >= size do nothing
// matrix - device-accessible buffer of at least `size` floats, updated in place
// vector - device-accessible buffer read at [threadIdx.x], so it must hold at
//          least blockDim.x floats
//
// NOTE(review): the vector is indexed by threadIdx.x, not by the element's
// column. This looks correct only when blockDim.x equals the row width (and
// the vector length) -- confirm against the launch configuration.
__global__ void kernel(const unsigned int size, float* matrix, const float* vector)
{
    // get the current element index for the thread
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size)
    {
        // add the vector entry selected by the thread's index within its block
        matrix[idx] += vector[threadIdx.x];
    }
}




// Launch one thread per matrix element: 64 threads per block, and enough
// blocks (ceiling division) to cover all M * N elements.
const int threads_per_block = 64;
const int num_blocks = (M * N + threads_per_block - 1) / threads_per_block;
kernel<<<num_blocks, threads_per_block>>>(M * N, matrix, vector);



// Row-major indexing: element (r, c) is at offset r * columns + c.
M[r * columns + c] += V[c];


2 个答案:

答案 0 :(得分:2)


该方法的主要限制在于:可处理的最大向量长度(也就是矩阵宽度)等于每个线程块的最大线程数,在 CUDA 7 当前支持的 GPU 上为 1024。


编辑:根据讨论/评论,OP 想知道如何处理行主序或列主序的底层存储。下面的示例使用模板化内核来选择行主序或列主序存储,并演示了一种可行的 CUBLAS 做法:使用 rank-1 更新函数(<T>ger)把向量加到矩阵的每一行上:

$ cat t712.cu
#include <iostream>
#include <cublas_v2.h>

#define ROWS 20
#define COLS 10

#define nTPB 64

#define ROW_MAJOR 0
#define COL_MAJOR 1

// Adds `vector` to every row of a height x width matrix in place, one thread
// per matrix element.
//
// Template parameters:
//   select - ROW_MAJOR or COL_MAJOR; picks how a flat element index is mapped
//            to a column (and therefore to a vector entry)
//   T      - element type of the matrix and the vector
// Parameters:
//   height - number of matrix rows
//   width  - number of matrix columns; also the number of vector entries read
//   matrix - device-accessible buffer of height*width elements, updated in place
//   vector - device-accessible buffer of width elements
// Launch: 1D grid of 1D blocks providing at least height*width threads;
// surplus threads are masked off by the bounds check.
template <int select, typename T>
__global__ void vec_mat_row_add(const unsigned int height, const unsigned int width, T* matrix, const T* vector)
{
    // get the current element index for the thread
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < height*width)
    {
        // add the vector entry that belongs to this element's column
        if (select == ROW_MAJOR)
            // row-major: the column of flat element idx is idx % width
            matrix[idx] += vector[idx%width];
        else // COL_MAJOR
            // column-major: each column holds `height` consecutive elements
            matrix[idx] += vector[idx/height];
    }
}

// Test driver: adds the vector 0,1,2,...,COLS-1 to every row of a zeroed
// ROWS x COLS matrix three ways (row-major kernel, column-major kernel, and
// cuBLAS rank-1 update) and prints each result row by row.
// Returns 0 on success, 1 if a cuBLAS call reports an error.
int main(){

  float *h_mat, *d_mat, *h_vec, *d_vec;
  const unsigned int msz = ROWS*COLS*sizeof(float);
  const unsigned int vsz = COLS*sizeof(float);
  h_mat = (float *)malloc(msz);
  h_vec = (float *)malloc(vsz);
  cudaMalloc(&d_mat, msz);
  cudaMalloc(&d_vec, vsz);
  for (int i=0; i<COLS; i++) h_vec[i] = i; // set vector to 0,1,2, ...
  cudaMemcpy(d_vec, h_vec, vsz, cudaMemcpyHostToDevice);
  // test row-major case
  cudaMemset(d_mat, 0, msz); // set matrix to zero
  vec_mat_row_add<ROW_MAJOR><<<(ROWS*COLS + nTPB -1)/nTPB, nTPB>>>(ROWS, COLS, d_mat, d_vec);
  cudaMemcpy(h_mat, d_mat, msz, cudaMemcpyDeviceToHost);
  std::cout << "Row-major result: " << std::endl;
  for (int i = 0; i < ROWS; i++){
    for (int j = 0; j < COLS; j++) std::cout << h_mat[i*COLS+j] << " ";
    std::cout << std::endl;}
  // test column-major case
  cudaMemset(d_mat, 0, msz); // set matrix to zero
  vec_mat_row_add<COL_MAJOR><<<(ROWS*COLS + nTPB -1)/nTPB, nTPB>>>(ROWS, COLS, d_mat, d_vec);
  cudaMemcpy(h_mat, d_mat, msz, cudaMemcpyDeviceToHost);
  std::cout << "Column-major result: " << std::endl;
  for (int i = 0; i < ROWS; i++){
    // column-major storage: element (i, j) is at j*ROWS+i
    for (int j = 0; j < COLS; j++) std::cout << h_mat[j*ROWS+i] << " ";
    std::cout << std::endl;}
  // test CUBLAS, doing matrix-vector add using <T>ger
  // A := alpha * x * y^T + A with x = ones(ROWS) and y = vec adds vec to every row
  cudaMemset(d_mat, 0, msz); // set matrix to zero
  float *d_ones, *h_ones;
  h_ones = (float *)malloc(ROWS*sizeof(float));
  for (int i =0; i<ROWS; i++) h_ones[i] = 1.0f;
  cudaMalloc(&d_ones, ROWS*sizeof(float));
  cudaMemcpy(d_ones, h_ones, ROWS*sizeof(float), cudaMemcpyHostToDevice);
  cublasHandle_t ch;
  // the handle must be initialized before it is passed to any cuBLAS routine
  cublasStatus_t stat = cublasCreate(&ch);
  if (stat != CUBLAS_STATUS_SUCCESS) {std::cout << "CUBLAS error: " << (int)stat << std::endl; return 1;}
  float alpha = 1.0f;
  stat = cublasSger(ch, ROWS, COLS, &alpha, d_ones, 1, d_vec, 1, d_mat, ROWS);
  if (stat != CUBLAS_STATUS_SUCCESS) {std::cout << "CUBLAS error: " << (int)stat << std::endl; return 1;}
  cudaMemcpy(h_mat, d_mat, msz, cudaMemcpyDeviceToHost);
  std::cout << "CUBLAS Column-major result: " << std::endl;
  for (int i = 0; i < ROWS; i++){
    for (int j = 0; j < COLS; j++) std::cout << h_mat[j*ROWS+i] << " ";
    std::cout << std::endl;}

  // release the cuBLAS handle and all device and host buffers
  cublasDestroy(ch);
  cudaFree(d_ones);
  cudaFree(d_vec);
  cudaFree(d_mat);
  free(h_ones);
  free(h_vec);
  free(h_mat);
  return 0;
}
$ nvcc -o t712 t712.cu -lcublas
$ ./t712
Row-major result:
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
Column-major result:
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
CUBLAS Column-major result:
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9

为简洁起见,我没有在代码中加入完善的 CUDA 错误检查(proper cuda error checking),但只要 CUDA 代码出了问题,加上它总是个好主意。作为一种替代的快捷方式,您可以用 cuda-memcheck 运行代码,快速检查是否存在 CUDA 错误。


答案 1 :(得分:1)

Robert Crovella已经回答了这个问题,提供了使用显式CUDA内核和cuBLAS的示例。

为便于日后参考,我认为有必要再提供一个示例,说明如何使用 CUDA Thrust 执行逐行或逐列的操作。具体来说,我关注以下两个问题:

  1. 将同一个列向量加到矩阵的每一列上;
  2. 将同一个行向量加到矩阵的每一行上。

thrust::transform 具有通用性,因此下面的示例可以推广到加法之外的逐元素运算(例如乘法、除法、减法等)。

    #include <cstdio>
    #include <iostream>

    #include <thrust/device_vector.h>
    #include <thrust/equal.h>
    #include <thrust/functional.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/permutation_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <thrust/random.h>
    #include <thrust/reduce.h>
    #include <thrust/sort.h>
    #include <thrust/transform.h>
    #include <thrust/unique.h>
    using namespace thrust::placeholders;
    // Functor that maps a linear index i to i / Ncols, i.e. the row index of
    // element i in a row-major matrix that has Ncols columns.
    template <typename T>
    struct linear_index_to_row_index : public thrust::unary_function<T,T> {
        T Ncols; // --- Number of columns (the divisor applied to every index)

        // Stores the divisor used by operator().
        __host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}

        // Returns i / Ncols; const so it can also be called on a const functor.
        __host__ __device__ T operator()(T i) const { return i / Ncols; }
    };
    /* MAIN */
    // Demo: builds a random Nrows x Ncols row-major matrix, a column vector and
    // a row vector, then uses thrust::transform to add the column vector to
    // every matrix column and the row vector to every matrix row, printing the
    // inputs and both results.
    int main()
    {
        const int Nrows = 10;           // --- Number of rows
        const int Ncols =  3;           // --- Number of columns

        // --- Random uniform integer distribution between 0 and 100
        thrust::default_random_engine rng;
        thrust::uniform_int_distribution<int> dist1(0, 100);

        // --- Random uniform integer distribution between 1 and 4
        thrust::uniform_int_distribution<int> dist2(1, 4);

        // --- Matrix allocation and initialization
        thrust::device_vector<float> d_matrix(Nrows * Ncols);
        for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (float)dist1(rng);

        // --- Column vector allocation and initialization
        thrust::device_vector<float> d_column(Nrows);
        for (size_t i = 0; i < d_column.size(); i++) d_column[i] = (float)dist2(rng);

        // --- Row vector allocation and initialization
        thrust::device_vector<float> d_row(Ncols);
        for (size_t i = 0; i < d_row.size(); i++) d_row[i] = (float)dist2(rng);

        printf("\n\nOriginal matrix\n");
        for(int i = 0; i < Nrows; i++) {
            std::cout << "[ ";
            for(int j = 0; j < Ncols; j++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "]\n";
        }

        printf("\n\nColumn vector\n");
        for(int i = 0; i < Nrows; i++) std::cout << d_column[i] << "\n";

        printf("\n\nRow vector\n");
        for(int i = 0; i < Ncols; i++) std::cout << d_row[i] << " ";

        // --- Add the same column vector to all matrix columns:
        //     element k of the row-major matrix receives d_column[k / Ncols],
        //     i.e. the column-vector entry of its own row.
        thrust::device_vector<float> d_matrix2(d_matrix);
        thrust::transform(d_matrix.begin(), d_matrix.end(),
                          thrust::make_permutation_iterator(
                                    d_column.begin(),
                                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Ncols))),
                          d_matrix2.begin(),
                          thrust::plus<float>());

        printf("\n\nColumn + Matrix -> Result matrix\n");
        for(int i = 0; i < Nrows; i++) {
            std::cout << "[ ";
            for(int j = 0; j < Ncols; j++)
                std::cout << d_matrix2[i * Ncols + j] << " ";
            std::cout << "]\n";
        }

        // --- Add the same row vector to all matrix rows:
        //     the matrix is visited column by column through the permutation
        //     k -> (k % Nrows) * Ncols + k / Nrows, so the k-th visited element
        //     lies in column k / Nrows and receives d_row[k / Nrows]. The result
        //     is written to d_matrix3 through the same permutation.
        thrust::device_vector<float> d_matrix3(d_matrix);
        thrust::transform(thrust::make_permutation_iterator(
                                    d_matrix.begin(),
                                    thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)),
                          thrust::make_permutation_iterator(
                                    d_matrix.begin(),
                                    thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)) + Nrows * Ncols,
                          thrust::make_permutation_iterator(
                                    d_row.begin(),
                                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows))),
                          thrust::make_permutation_iterator(
                                    d_matrix3.begin(),
                                    thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)),
                          thrust::plus<float>());

        printf("\n\nRow + Matrix -> Result matrix\n");
        for(int i = 0; i < Nrows; i++) {
            std::cout << "[ ";
            for(int j = 0; j < Ncols; j++)
                std::cout << d_matrix3[i * Ncols + j] << " ";
            std::cout << "]\n";
        }

        return 0;
    }