使用CUDA的cusolverSpDcsrlsvlu或QR方法

时间:2015-05-05 18:00:22

标签: cuda cusolver

我到处搜索,仍然无法解决这个问题。运行时报错:“cusolver test.exe 中 0x00007FFF3AD3D430 (cusolver64_70.dll) 处有未处理的异常: 0xC0000005: 读取位置 0x0000000400960004 时发生访问冲突。” 我想用 LU 求解器(cusolverSpDcsrlsvlu)或 QR 方法求解 Ax = B。代码编译时没有任何错误,但运行时出现上述错误,程序在下面代码的最后一行(求解器调用)处崩溃。我的代码如下:

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cublas.h>
#include <cusolver_common.h>
#include <cusolverSp.h>
#include <cusparse.h>
#include "device_launch_parameters.h"
#include <cuda_runtime.h>

# include <memory.h>
# include <thrust/device_vector.h>
# include <thrust/host_vector.h>
# include <thrust/device_ptr.h>
# include <thrust/system/cuda/execution_policy.h>



// Host-side vectors of the linear system A * X = Y: Y is the right-hand side,
// X receives the solution. Both are allocated in main().
double *X = NULL;
double *Y = NULL;

// Pointers intended for device-side copies of X and Y (the "d" prefix marks
// device pointers in this file).
double *dX = NULL;
double *dY = NULL;

// Number of buses in the system. main() sizes its arrays as NoOfBuses + 1
// because it indexes buses starting from 1 and leaves slot 0 unused.
int NoOfBuses = 4;


void main(void)
{
cusparseStatus_t status;
int row;
double *matA, *d_matA;
size_t pitchd_matA;
int *dNnzPerRow;
double *dCsrValA, *H_CsrVal;
int *dCsrRowPtrA, *HCsrRowPtrA;
int *dCsrColIndA, *HCsrColIndA;
int totalNnz;

cusparseHandle_t handle = 0;
cusparseMatDescr_t descr = 0;


//---------------------------------------------------------------------------------------------------------------

matA = (double *)calloc(((NoOfBuses+1)*(NoOfBuses+1)), sizeof(double));
Y = (double *)calloc((NoOfBuses + 1), sizeof(double));
X = (double *)calloc((NoOfBuses + 1), sizeof(double));

//STTORING IN col MAJOR FORM
for (int Row = 1; Row <= NoOfBuses; Row++)
{
    double value = 1;

    for (int Col = 1; Col <= NoOfBuses; Col++)
    {
        matA[Row + Col*(NoOfBuses + 1)] = value;
        value++;
    }
}


double value = 1;

for (int index = 1; index <= NoOfBuses; index++)
{
    Y[index] = value;
    value++;
}

printf("\n");
printf("A matrix\n");
for (int Row = 0; Row <= NoOfBuses; Row++)
{
    for (int Col = 0; Col <= NoOfBuses; Col++)
    {
        printf("%f\t",matA[Col + Row*(NoOfBuses + 1)] );
    }
    printf("\n");
}

printf("Y matrix\n\n");
for (int index = 0; index <= NoOfBuses; index++)
{
    printf("%f\n",Y[index]);
}

//-------------------------------------------------------------------------------------------------------

 cusparseCreate(&handle);

 // Allocate device memory to store the sparse CSR representation of A
 cudaMalloc((void **)&dCsrValA, sizeof(double)* 16 );
 cudaMalloc((void **)&dCsrColIndA, sizeof(int)* 16);
 cudaMalloc((void **)&dCsrRowPtrA, sizeof(int)* (NoOfBuses + 2));


// Allocate device memory for vectors and the dense form of the matrix A 

//cudaMallocPitch((void **) &d_matA, &pitchd_matA, sizeof(double)*(NoOfBuses + 1), (NoOfBuses + 1));
cudaMalloc((void **)&d_matA, sizeof(double)* (NoOfBuses + 1)*(NoOfBuses + 1));
cudaMalloc((void **)&dNnzPerRow, sizeof(int)* NoOfBuses);


//transfer Matrix A,X & Y to the GPU
cudaMemcpy(dX, X, sizeof(double) * (NoOfBuses +1), cudaMemcpyHostToDevice);
cudaMemcpy(dY, Y, sizeof(double)* (NoOfBuses + 1), cudaMemcpyHostToDevice);
//cudaMemcpy2D(d_matA, pitchd_matA, matA, sizeof(double)*(NoOfBuses + 1), sizeof(double)*(NoOfBuses + 1), (NoOfBuses + 1), cudaMemcpyHostToDevice);
cudaMemcpy(d_matA,matA,sizeof(double) * (NoOfBuses + 1)*(NoOfBuses + 1), cudaMemcpyHostToDevice);


// Construct a descriptor of the matrix A
 cusparseCreateMatDescr(&descr);
 cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
 cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
 cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_NON_UNIT);
 //cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER);

cusparseDnnz(handle, CUSPARSE_DIRECTION_COLUMN, (NoOfBuses+1), (NoOfBuses+1), descr, d_matA, (NoOfBuses+1), dNnzPerRow, &totalNnz);

printf("the total number of Non zero elements are = %d",totalNnz);




cusparseDdense2csr(handle,  (NoOfBuses+1), (NoOfBuses+1), descr, d_matA, (NoOfBuses+1), dNnzPerRow, dCsrValA, dCsrRowPtrA, dCsrColIndA);


cudaMalloc((void **)&dX, sizeof(double)* (NoOfBuses + 1));
cudaMalloc((void **)&dY, sizeof(double)* (NoOfBuses + 1));
H_CsrVal = (double *)calloc((totalNnz), sizeof(double));
HCsrRowPtrA = (int *)calloc((NoOfBuses + 2), sizeof(int));
HCsrColIndA = (int *)calloc((totalNnz), sizeof(int));


cudaMemcpy(H_CsrVal, dCsrValA, sizeof(double) * (totalNnz), cudaMemcpyDeviceToHost);
cudaMemcpy(HCsrRowPtrA, dCsrRowPtrA, sizeof(int)* (NoOfBuses + 2), cudaMemcpyDeviceToHost);
cudaMemcpy(HCsrColIndA, dCsrColIndA, sizeof(int)* (totalNnz), cudaMemcpyDeviceToHost);

printf("Values\n\n");
for (int index = 0; index <= (totalNnz-1); index++)
{
    printf("%f\n", H_CsrVal[index]);
}

printf("\ncol pointer matrix\n\n");
for (int index = 0; index <= (totalNnz - 1); index++)
{
    printf("%d\n", HCsrColIndA[index]);
}

printf("\nrow ofssett pointer matrix\n\n");
for (int index = 0; index <= (NoOfBuses +2 - 1); index++)
{
    printf("%d\n", HCsrRowPtrA[index]);
}


//------------------------------------------------------------------------------------

cusolverSpHandle_t handleSolver;
double tol = 0.0000001;
int reorder = 0;
int valuefor,*singularity = &valuefor;
*singularity = 0;
cudaStream_t streamId = NULL;
cusolverStatus_t Checker;

Checker=cusolverSpCreate(&handleSolver);
cusolverStatus_t cudasu = cusolverSpSetStream(handleSolver, streamId);
cusolverStatus_t pakao = cusolverSpDcsrlsvluHost(handleSolver,5, totalNnz, descr, dCsrValA, dCsrRowPtrA, dCsrColIndA, dY, tol, reorder, dX, singularity);


getchar();
}

1 个答案:

答案 0 :(得分:1)

您调用的是该 API 的 Host(主机)版本,但传给它的却是设备端(device)变量:

cudaMalloc((void **)&dCsrValA, sizeof(double)* 16 );
...

cusolverStatus_t pakao = cusolverSpDcsrlsvluHost(handleSolver,5, totalNnz, descr, dCsrValA, dCsrRowPtrA, dCsrColIndA, dY, tol, reorder, dX, singularity);
                                            ^^^^                                  ^^

参考 cuSOLVER 官方文档:

(原帖此处为一张文档截图)

从文档可以看出,对于主机(Host)路径,所有变量都必须位于主机内存中,而不是设备内存中。