How to compute the solution of a system of linear equations with cuSOLVER

Date: 2021-07-11 15:49:03

Tags: cuda cusolver

I am trying to use cuSOLVER to compute the solution of a system of linear equations in double precision. I am following the method declaration given here:

https://docs.nvidia.com/cuda/cusolver/index.html#cusolverDN-lt-t-gt-gesv

In general, one first has to find the required workspace size with cusolverDnDDgesv_bufferSize, and can then solve the linear system with cusolverDnDDgesv. The first part seems to work: I can compute the buffer size without errors. However, when cusolverDnDDgesv runs, I get Segmentation fault (core dumped). I can use other cuSOLVER features without problems, but I cannot figure out what the issue is with cusolverDnDDgesv. Could you help me resolve this?
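
For reference, this is the declaration of the solver as I read it from that documentation page (worth double-checking against the cusolverDn.h header of your CUDA version, since parameter names may differ slightly):

cusolverStatus_t
cusolverDnDDgesv(cusolverDnHandle_t handle, int n, int nrhs,
                 double *dA, int ldda, int *dipiv,
                 double *dB, int lddb,
                 double *dX, int lddx,
                 void *dWorkspace, size_t lwork_bytes,
                 int *niter, int *dinfo);

As I understand it, dA, dipiv, dB, dX, dWorkspace and dinfo are device pointers, while niter is a host pointer that receives the iteration count.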

Here is a minimal example. I compile and run it with nvcc cusolverDnZZgesv.cu -o prog.out -lcusolver.

#include <assert.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[]) {
  cusolverDnHandle_t handle = NULL;
  cusolverStatus_t cusolver_status;
  cudaError_t cudaStat1;
  cudaError_t cudaStat2;
  cudaError_t cudaStat3;

  // Set dimensions of A*x=b.
  const int n = 3;
  const int ldda = n;
  const int lddb = n;
  const int lddx = n;
  const int nrhs = 1; // number of right hand side vectors
  /*       | 1 2 3 |
   *   A = | 4 5 6 |
   *       | 2 1 1 |
   *
   *   x = (1 1 1)'
   *   b = (6 15 4)'
   */
  // Matrix A, stored in column-major order.
  double A[ldda * n] = {1.0, 4.0, 2.0, 2.0, 5.0, 1.0, 3.0, 6.0, 1.0};
  // Vector b, right hand side of equation.
  double B[lddb * nrhs] = {6.0, 15.0, 4.0};
  // Expected solution of A*x=b; the GPU result is copied back into X later.
  double X[lddx * nrhs] = {1.0, 1.0, 1.0};

  /* device memory */
  double *d_A = NULL;
  double *d_X = NULL;
  double *d_B = NULL;

  // Allocate memory on GPU.
  cudaStat1 = cudaMalloc((void **)&d_A, sizeof(double) * ldda * n);
  cudaStat2 = cudaMalloc((void **)&d_X, sizeof(double) * lddx * nrhs);
  cudaStat3 = cudaMalloc((void **)&d_B, sizeof(double) * lddb * nrhs);
  assert(cudaSuccess == cudaStat1);
  assert(cudaSuccess == cudaStat2);
  assert(cudaSuccess == cudaStat3);

  // Copy to GPU.
  cudaStat1 =
      cudaMemcpy(d_A, A, sizeof(double) * ldda * n, cudaMemcpyHostToDevice);
  cudaStat2 =
      cudaMemcpy(d_B, B, sizeof(double) * lddb * nrhs, cudaMemcpyHostToDevice);
  assert(cudaSuccess == cudaStat1);
  assert(cudaSuccess == cudaStat2);

  // ###########################################
  // cusolverDnDDgesv_bufferSize
  // ###########################################
  int *dipiv = NULL;
  int *dwork = NULL;
  size_t lwork_bytes = 0;

  cusolver_status = cusolverDnCreate(&handle);
  assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

  // Query the required workspace size in bytes.
  cusolver_status =
      cusolverDnDDgesv_bufferSize(handle, n, nrhs, d_A, ldda, dipiv, d_B, lddb,
                                  d_X, lddx, dwork, &lwork_bytes);
  assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

  // ###########################################
  // cusolverDnDDgesv
  // ###########################################
  void *dWorkspace = NULL;
  int *dinfo = NULL;
  int niter;

  // lwork_bytes is already a size in bytes, so no sizeof(double) factor is needed.
  cudaStat1 = cudaMalloc((void **)&dWorkspace, lwork_bytes);
  cudaStat2 = cudaMalloc((void **)&dinfo, sizeof(int));
  cudaStat3 = cudaMalloc((void **)&dipiv, sizeof(int) * n);
  assert(cudaSuccess == cudaStat1);
  assert(cudaSuccess == cudaStat2);
  assert(cudaSuccess == cudaStat3);

  // Solve A*x = b; dipiv, dinfo and the workspace live in device memory,
  // while niter is returned on the host.
  cusolver_status =
      cusolverDnDDgesv(handle, n, nrhs, d_A, ldda, dipiv, d_B, lddb, d_X, lddx,
                       dWorkspace, lwork_bytes, &niter, dinfo);
  assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

  // Print the expected exact solution (X still holds its initial host values).
  std::cout << "Result:" << std::endl;
  for (double x : X)
    std::cout << x << std::endl;

  // Copy result from GPU to CPU.
  cudaStat1 =
      cudaMemcpy(X, d_X, sizeof(double) * lddx * nrhs, cudaMemcpyDeviceToHost);
  assert(cudaSuccess == cudaStat1);

  // Write result from GPU to terminal.
  std::cout << "Result from GPU:" << std::endl;
  for (double x : X)
    std::cout << x << std::endl;
  // Free device memory and destroy the cuSOLVER handle.
  cudaFree(d_A);
  cudaFree(d_X);
  cudaFree(d_B);
  cudaFree(dipiv);
  cudaFree(dinfo);
  cudaFree(dWorkspace);
  cusolverDnDestroy(handle);

  return 0;
}
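
If it helps, this is how I would inspect the solver's outputs right after the cusolverDnDDgesv call, assuming it returned at all. This is a minimal sketch that reuses the variables from the example above; h_info is my own helper name, and my reading of niter is hedged from the documentation:

  // Sketch: check for a sticky CUDA error and inspect the gesv outputs.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
    std::cout << "CUDA error: " << cudaGetErrorString(err) << std::endl;

  // dinfo == 0 should indicate a successful factorization and solve; niter
  // is the refinement iteration count (negative values indicate a fallback
  // inside the solver, as far as I understand the documentation).
  int h_info = 0;
  cudaStat1 = cudaMemcpy(&h_info, dinfo, sizeof(int), cudaMemcpyDeviceToHost);
  assert(cudaSuccess == cudaStat1);
  std::cout << "niter = " << niter << ", dinfo = " << h_info << std::endl;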

0 Answers:

There are no answers yet.