我正在尝试使用 CuSOLVER 来计算双精度线性方程组的解。我正在关注这里的方法声明:
https://docs.nvidia.com/cuda/cusolver/index.html#cusolverDN-lt-t-gt-gesv
通常,首先必须使用方法 cusolverDnDDgesv_bufferSize
找到工作区的最佳大小。可以用 cusolverDnDDgesv
求解线性系统。第一部分似乎有效,所以我可以无误地计算缓冲区大小。但是,当运行 cusolverDnDDgesv
时,我得到 Segmentation fault (core dumped)
。我可以使用 CuSOLVER 的其他功能,但在使用 cusolverDnDDgesv
时我无法弄清楚问题是什么。你能帮忙解决这个问题吗?
这是一个最小的例子。我用 nvcc cusolverDnZZgesv.cu -o prog.out -lcusolver
运行它。
#include <assert.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
int main(int argc, char *argv[]) {
cusolverDnHandle_t handle = NULL;
cusolverStatus_t cusolver_status;
cudaError_t cudaStat1;
cudaError_t cudaStat2;
cudaError_t cudaStat3;
// Set dimensions of A*x=b.
const int n = 3;
const int ldda = n;
const int lddb = n;
const int lddx = n;
const int nrhs = 1; // number of right hand side vectors
/* | 1 2 3 |
* A = | 4 5 6 |
* | 2 1 1 |
*
* x = (1 1 1)'
* b = (6 15 4)'
*/
// Matrix A.
double A[ldda * n] = {1.0, 4.0, 2.0, 2.0, 5.0, 1.0, 3.0, 6.0, 1.0};
// Vector b, right hand side of equation.
double B[lddb * nrhs] = {6.0, 15.0, 4.0};
// Solution of A*x=b.
double X[lddx * nrhs] = {1.0, 1.0, 1.0};
/* device memory */
double *d_A = NULL;
double *d_X = NULL;
double *d_B = NULL;
// Allocate memory on GPU.
cudaStat1 = cudaMalloc((void **)&d_A, sizeof(double) * ldda * n);
cudaStat2 = cudaMalloc((void **)&d_X, sizeof(double) * lddx * nrhs);
cudaStat3 = cudaMalloc((void **)&d_B, sizeof(double) * lddb * nrhs);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
// Copy to GPU.
cudaStat1 =
cudaMemcpy(d_A, A, sizeof(double) * ldda * n, cudaMemcpyHostToDevice);
cudaStat2 =
cudaMemcpy(d_B, B, sizeof(double) * lddb * nrhs, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
// int lwork = 0;
//
// cusolver_status =
// cusolverDnDgeqrf_bufferSize(handle, n, n, d_A, ldda, &lwork);
// ###########################################
// cusolverDnDDgesv_bufferSize
// ###########################################
int *dipiv = NULL;
int *dwork = NULL;
size_t lwork_bytes = 0;
cusolver_status = cusolverDnCreate(&handle);
assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
cusolver_status =
cusolverDnDDgesv_bufferSize(handle, n, nrhs, d_A, ldda, dipiv, d_B, lddb,
d_X, lddx, dwork, &lwork_bytes);
assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
// ###########################################
// cusolverDnDDgesv
// ###########################################
void *dWorkspace = NULL;
int *dinfo = NULL;
int niter;
cudaStat1 = cudaMalloc((void **)&dWorkspace, sizeof(double) * lwork_bytes);
cudaStat2 = cudaMalloc((void **)&dinfo, sizeof(int));
cudaStat3 = cudaMalloc((void **)&dipiv, sizeof(int) * n);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
cusolver_status =
cusolverDnDDgesv(handle, n, nrhs, d_A, ldda, dipiv, d_B, lddb, d_X, lddx,
dWorkspace, lwork_bytes, &niter, dinfo);
assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
// Write exact result to terminal.
std::cout << "Result:" << std::endl;
for (double x : X)
std::cout << x << std::endl;
// Copy result from GPU to CPU.
cudaStat1 =
cudaMemcpy(X, d_X, sizeof(double) * lddx * n, cudaMemcpyDeviceToHost);
// Write result from GPU to terminal.
std::cout << "Result from GPU:" << std::endl;
for (double x : X)
std::cout << x << std::endl;
}