CUDA:cuSolver引发异常

时间:2015-11-22 19:43:25

标签: cuda linear-equation cusolver

我正在尝试使用cusolver库来求解多个线性方程组,但程序抛出了异常,这很奇怪。代码只调用了库中的一个函数,其余部分都是内存分配和内存拷贝。该函数是

cusolverSpScsrlsvcholHost(
   cusolverSpHandle_t handle, int m, int nnz,
   const cusparseMatDescr_t descrA, const float *csrVal,
   const int *csrRowPtr, const int *csrColInd, const float *b,
   float tol, int reorder, float *x, int *singularity); 

我认为问题可能出在 reorder(重新排序)和 singularity(奇异性)这两个参数上,其余的都是矩阵参数。代码如下:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <cusparse.h>
#include <cublas_v2.h>
#include <stdio.h>
#include <cusolverSp.h>

// Test program from the question: builds a 3x3 sparse matrix as CSR arrays,
// copies the matrix and two vectors into device buffers, calls
// cusolverSpScsrlsvluHost once, copies dev_y back into y and prints the
// elapsed time followed by the three entries of y.
//
// None of the CUDA / cuSPARSE / cuSOLVER return codes are inspected, so a
// failing call is not reported anywhere.
//
// Returns 1 unconditionally.
int main()
{
    // Problem size: a size x size matrix with nnz stored entries.
    const int size = 3;
    int nnz = 6 ;
    // Receives the solver's singularity output; starts at -1.
    int sing = -1 ;

    //float values[] = {0,0,0,0} ;
    // CSR arrays: stored values, their column indices, and the row offsets.
    float values[] = {1,2,3,4,5,6} ;
    int colIdx[] = {0,0,1,0,1,2};
    // NOTE(review): the final offset is 7, but values/colIdx hold only nnz = 6
    // entries (indices 0..5) -- looks like it should be 6; confirm.
    int rowPtr[] = {0, 1,3,7};

    // Host-side vectors: x carries non-zero data, y starts out as zeros.
    float x[] = {4,-6,7};
    float y[3]= {0,0,0} ;

    // Device-side counterparts of the host arrays above.
    float *dev_values = 0 ;
    int *dev_rowPtr = 0 ;
    int *dev_colIdx = 0 ;
    float *dev_x = 0 ;
    float *dev_y = 0 ;

    // cuSOLVER sparse handle used by the solver call below.
    cusolverSpHandle_t solver_handle ;
    cusolverSpCreate(&solver_handle) ;

    // Matrix descriptor: general matrix, zero-based indexing.
    cusparseMatDescr_t descr = 0;

    cusparseCreateMatDescr(&descr);
    cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaSetDevice(0);

    // Timing events. start is recorded before the allocations, so the measured
    // interval spans the allocations, the copies and the solver call.
    cudaEvent_t start, stop;
    float time;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // Allocate the five device buffers: two vectors and the three CSR arrays.
    cudaMalloc((void**)&dev_x, size * sizeof(float));
    cudaMalloc((void**)&dev_y, size * sizeof(float));
    cudaMalloc((void**)&dev_values, nnz * sizeof(float));
    cudaMalloc((void**)&dev_rowPtr, (size + 1) * sizeof(int));
    cudaMalloc((void**)&dev_colIdx, nnz * sizeof(int));

    // Copy every host array into its device buffer.
    cudaMemcpyAsync(dev_x, x, size * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpyAsync(dev_values, values, nnz * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpyAsync(dev_rowPtr, rowPtr, (size + 1) * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpyAsync(dev_colIdx, colIdx, nnz * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpyAsync(dev_y, y, size * sizeof(float), cudaMemcpyHostToDevice);

    // NOTE(review): this is the "Host" variant of the solver, yet every array
    // argument is a dev_* pointer -- presumably it expects host pointers
    // (values, rowPtr, colIdx, ...); confirm against the cuSOLVER docs.
    // NOTE(review): assuming the same parameter order as the
    // cusolverSpScsrlsvcholHost prototype (..., b, tol, reorder, x, singularity),
    // the all-zero dev_y is passed as the right-hand side and dev_x as the
    // output, which looks reversed -- confirm.
    cusolverSpScsrlsvluHost(solver_handle, size, nnz, descr, dev_values, dev_rowPtr, dev_colIdx,     dev_y, 0,0, dev_x, &sing);


    // Bring dev_y back into the host array that is printed below.
    cudaMemcpyAsync(y, dev_y, size*sizeof(float), cudaMemcpyDeviceToHost );

    // Close the timing interval and wait for the stop event before reading it.
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf ("Time for the kernel: %f ms\n", time);

    printf("%f\n",y[0]);
    printf("%f\n",y[1]);
    printf("%f\n",y[2]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.

    // NOTE(review): the cudaFree calls are issued after cudaDeviceReset, and the
    // solver handle, matrix descriptor and events are never destroyed -- confirm
    // the intended teardown order.
    cudaDeviceReset();
    cudaFree(dev_x);
    cudaFree(dev_y);
    cudaFree(dev_values);
    cudaFree(dev_rowPtr);
    cudaFree(dev_colIdx);
    // Exits with status 1 even when nothing went wrong.
    return 1;
}

1 个答案:

答案 0 :(得分:2)

您的代码中至少有3个问题:

  1. 您正在使用该函数的主机(Host)变体:cusolverSpScsrlsvluHost()。如果您查看cusolverSpScsrlsvluHost的文档,您会发现对于主机MemSpace,该函数要求所有参数和指针参数都位于主机端。但是您传给该函数的是设备指针,这样就会导致段错误。对于dev_values之类的所有参数,都需要替换为对应的主机数据指针(例如用values代替dev_values)。

  2. 您的CSR矩阵格式不正确。这一行:

    int rowPtr[] = {0, 1,3,7};
    

    应该是这样的:

    int rowPtr[] = {0, 1,3,6};
    

    行指针数组的最后一项应指向最后一个元素之后的位置,其正确值是6而不是7,因为6个实际元素的编号为0..5。此问题同样可能导致段错误。

  3. 您将 y 和 x 以错误的(颠倒的)顺序传递给了cusolverSpScsrlsvluHost()。由于您在x中放入了非零值,您大概希望把它作为右端项(RHS)向量。该向量在文档中名为b,是参数列表中先传入的那个向量。而您的y向量大概是解向量,它是参数顺序中最后传入的向量(在文档中名为x)。

  4. 我建议使用正确的错误检查。

  5. 以下代码解决了上述问题,并产生了合理的结果:

    $ cat t979.cu
    #include <cusparse.h>
    #include <stdio.h>
    #include <cusolverSp.h>
    #include <assert.h>
    
    // Solves the 3x3 sparse system A*y = x on the host with cuSOLVER's sparse LU
    // solver (cusolverSpScsrlsvluHost) and prints the three solution components,
    // one per line.
    //
    // Every array handed to the solver is ordinary host memory, as the Host
    // variant of the routine requires. Each library call is checked explicitly
    // (the checks stay active under -DNDEBUG, unlike assert), the singularity
    // output is inspected before the result is trusted, and the cuSOLVER handle
    // and cuSPARSE descriptor are released on every path.
    //
    // Returns 0 on success, 1 if a library call fails or A is reported singular.
    int main()
    {
        // Problem definition. In zero-based CSR form the arrays below describe
        //     [1 0 0]
        // A = [2 3 0]
        //     [4 5 6]
        const int size = 3;
        const int nnz = 6 ;

        float values[nnz] = {1,2,3,4,5,6} ;
        int colIdx[nnz] = {0,0,1,0,1,2};
        // size+1 row offsets; the last one is one past the final element, i.e. nnz.
        int rowPtr[size+1] = {0, 1,3,6};

        float x[size] = {4,-6,7};   // right-hand side (named "b" in the cuSOLVER docs)
        float y[size]= {0,0,0} ;    // solution vector (named "x" in the cuSOLVER docs)
        int sing = 0;               // solver output: -1 when A is invertible

        cusolverStatus_t cso;
        cusparseStatus_t csp;
        cusolverSpHandle_t solver_handle = 0;
        cusparseMatDescr_t descr = 0;
        int exit_code = 1;          // flipped to 0 only after a successful solve

        cso = cusolverSpCreate(&solver_handle) ;
        if (cso != CUSOLVER_STATUS_SUCCESS) {
            fprintf(stderr, "cusolverSpCreate failed (status %d)\n", (int)cso);
            return 1;
        }

        csp = cusparseCreateMatDescr(&descr);
        if (csp != CUSPARSE_STATUS_SUCCESS) {
            fprintf(stderr, "cusparseCreateMatDescr failed (status %d)\n", (int)csp);
            cusolverSpDestroy(solver_handle);
            return 1;
        }

        // Single-pass block: any failure breaks out to the shared cleanup below.
        do {
            csp = cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
            if (csp != CUSPARSE_STATUS_SUCCESS) {
                fprintf(stderr, "cusparseSetMatType failed (status %d)\n", (int)csp);
                break;
            }

            csp = cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);
            if (csp != CUSPARSE_STATUS_SUCCESS) {
                fprintf(stderr, "cusparseSetMatIndexBase failed (status %d)\n", (int)csp);
                break;
            }

            // Argument order is (..., b, tol, reorder, x, singularity): the
            // right-hand side comes first, the solution buffer last.
            // tol = 0.0f and reorder = 0 (no reordering), as in the original call.
            cso = cusolverSpScsrlsvluHost(solver_handle, size, nnz, descr,
                                          values, rowPtr, colIdx,
                                          x, 0.0f, 0, y, &sing);
            if (cso != CUSOLVER_STATUS_SUCCESS) {
                fprintf(stderr, "cusolverSpScsrlsvluHost failed (status %d)\n", (int)cso);
                break;
            }

            // A non-negative value is the index of the first (near-)zero pivot;
            // y is not a meaningful solution in that case.
            if (sing >= 0) {
                fprintf(stderr, "matrix is singular at row %d\n", sing);
                break;
            }

            printf("%f\n",y[0]);
            printf("%f\n",y[1]);
            printf("%f\n",y[2]);
            exit_code = 0;
        } while (0);

        // Release library resources on both the success and the failure path.
        cusparseDestroyMatDescr(descr);
        cusolverSpDestroy(solver_handle);

        return exit_code;
    }
    $ nvcc -o t979 t979.cu -lcusolver -lcusparse
    $ ./t979
    4.000000
    -4.666667
    2.388889
    $
    

    另请注意,CUDA 附带了一个完整可运行的示例代码(sample code),演示了此函数的正确用法。