Question

因此，在最终让我的GPU集成器正常工作之后，我发现如果我增加系统的维度，我正在尝试集成，.exe编译并且看起来它正在运行，但随后一直“停止工作。“

潜在的问题 - 程序使用一个运行积分器的主线程并循环我正在积分的点数。我认为这是主线程，它必须运行整个集成的长度（在我的python / pyCUDA版本的代码通常需要几个小时）才会导致问题。

我考虑过的另一个潜在问题是，更改系统的维度会直接影响内核调用中启动的线程数。 100个工作，但200导致.exe已停止工作错误。我在GTX Titan上运行，但是我知道每块可以启动多达1024个线程，所以我认为这不是问题。

潜在的解决方案 - 现在我已经了解了超时检测和恢复的问题。 http://msdn.microsoft.com/en-us/windows/hardware/gg487368.aspx我遇到了此错误，并使用此处记录的方法：http://http.developer.nvidia.com/ParallelNsight/2.1/Documentation/UserGuide/HTML/Content/Using_CUDA_Debugger.htm使用NSIGHT Monitor关闭WDDM。我不再收到特定的“驱动程序已停止响应并已重置”错误。

不会抛出CUDA错误消息。出现错误后按调试，我得到了

"Unhandled exception at 0x0000000013F07B0A7 in Dynamic Parallelism Test.exe: 0xC00000FD: Stack Overflow : (parameters: 0x0000000000000001, 0x0000000000193000)."

抱歉，不确定0的数字。

Google搜索我们同名网站http://en.wikipedia.org/wiki/Stack_overflow的实际含义，这是否表明我的内核尝试使用的内存会发生奇怪的事情......

修改

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
//#include <stdio.h>
#include <iostream>
#include <fstream>
#include <iomanip>                      //display 2 decimal places
#include <math.h>
using namespace std;

__global__ void rkf5(size_t, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, int*, int*, int*, int*, double*, double*, double*, double*, double*, double*, double* , double*);
__global__ void calcK(int*, int*, int*, double*, double*, double*, double*, double*, double*, double*, double*, double*);
__global__ void k1(double*, double*, double*);
__global__ void k2(double*, double*, double*);
__global__ void k3(double*, double*, double*);
__global__ void k4(double*, double*, double*);
__global__ void k5(double*, double*, double*);
__global__ void k6(double*, double*, double*);
__global__ void arrAdd(double*, double*, double*);
__global__ void arrSub(double*, double*, double*);
__global__ void arrMult(double*, double*, double*);
__global__ void arrInit(double*, double);
__global__ void arrCopy(double*, double*);
__device__ void setup(double , double*, double*, double*, double*, int*);
__device__ double flux(int, double*) ;
__device__ double knowles_flux(int, int*, int*, int*, double*, double*, double*, double*, double*, double*, double*);
__device__ void calcStepSize(double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, int*);
__global__ void storeConcs(double*, size_t, double*, int);
__global__ void takeFourthOrderStep(double*, double*, double*, double*, double*, double*, double*);
__global__ void takeFifthOrderStep(double*, double*, double*, double*, double*, double*, double*, double*);

//Error checking that I don't understand yet.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

//Main program.
int main(int argc, char** argv)
{
    //std::cout << std::fixed;              //display 2 decimal places
    //std::cout << std::setprecision(8);        //display 2 decimal places

    const int maxlength = 125;              //Number of discrete concentrations we are tracking.
    int nc = 2;                             //Nucleus Size
    int n2 = 0;                             //Secondary Nucleus Size
    double ka = 5E4;        //Monomer addition rate
    double kb = 0;      //Monomer subtraction rate
    double kp = 0;      //Oligomer addition rate
    double km = 2E-8;       //Oligomer subtraction rate
    double kn = 2E-5;       //Nucleation rate
    double kn2 = 0; //Secondary nucleation rate
    double mo = 5E-6;                           //Initial concentration in M

    double concs[maxlength];                //Meant to store the current concentrations 
    double temp1[maxlength];                //Used as a bin to store products of Butcher's tableau and k values.
    double temp2[maxlength];                //Used as a bin to store products of Butcher's tableau and k values.
    double tempsum[maxlength];              //Used as a bin to store cumulative sum of tableau and k values
    double k1s[maxlength];
    double k2s[maxlength];
    double k3s[maxlength];
    double k4s[maxlength];
    double k5s[maxlength];
    double k6s[maxlength];
    const int numpoints = 1000;     
    double to = 0;                          //Beginning integration time in seconds
    double tf = 5;                          //Final integration time in seconds
    double dt = (tf-to)/static_cast<double>(numpoints); //Static step size in seconds
    double concStorage[maxlength][numpoints];   //Stores concs [rows] vs. time [columns]

    //Initialize all the arrays on the host to ensure arrays of 0's are sent to the device.
    //Also, here is where we can seed the system.
    std::cout<<dt;
    std::cout<<"\n";
    concs[0]=mo;
    std::cout<<concs[0];
    std::cout<<" ";
    for (int i=0; i<maxlength; i++)
    {
        for (int j=0; j<numpoints; j++)
            concStorage[i][j]=0;
        concs[i]=0;
        temp1[i]=0;
        temp2[i]=0;
        tempsum[i]=0;
        k1s[i]=0;
        k2s[i]=0;
        k3s[i]=0;
        k4s[i]=0;
        k5s[i]=0;
        k6s[i]=0;
        //std::cout<<concs[i];
        //std::cout<<" ";
    }
    concs[0]=mo;
    std::cout<<"\n";

    //Define all the pointers to device array memory addresses. These contain the on-GPU
    //addresses of all the data we're generating/using.
    double *d_concStorage;
    double *d_temp1;
    double *d_temp2;
    double *d_tempsum;
    double *d_k1s;
    double *d_k2s;
    double *d_k3s;
    double *d_k4s;
    double *d_k5s;
    double *d_k6s;
    int *d_numpoints;
    int *d_maxlength;
    int *d_nc;              
    int *d_n2;
    double *d_ka;       
    double *d_kb;       
    double *d_kp;       
    double *d_km;       
    double *d_kn;   
    double *d_kn2;
    double *d_concs;

    double *d_dt;
    double *d_to;
    double *d_tf;


    //Calculate all the sizes of the arrays in order to allocate the proper amount of memory on the GPU.
    //A lot of these can be simplified to "sizeof(double)" etc
    size_t size_temp1 = sizeof(temp1);
    size_t size_temp2 = sizeof(temp2);
    size_t size_tempsum = sizeof(tempsum);
    size_t size_ks = sizeof(k1s);
    size_t size_numpoints = sizeof(numpoints);
    size_t size_maxlength = sizeof(maxlength);
    size_t size_nc = sizeof(nc);
    size_t size_n2 = sizeof(n2);
    size_t size_ka = sizeof(ka);
    size_t size_kb = sizeof(kb);
    size_t size_kp = sizeof(kp);
    size_t size_km = sizeof(km);
    size_t size_kn = sizeof(kn);
    size_t size_kn2 = sizeof(kn2);
    size_t size_concs = sizeof(concs);

    size_t size_dt = sizeof(dt);
    size_t size_to = sizeof(to);
    size_t size_tf = sizeof(tf);
    size_t h_pitch = numpoints*sizeof(double);
    size_t d_pitch;

    //Calculate the "pitch" of the 2D array.  The pitch is basically the length of a 2D array's row.  IT's larger 
    //than the actual row full of data due to hadware issues.  We thusly will use the pitch instead of the data 
    //size to traverse the array.
    gpuErrchk(cudaMallocPitch( (void**)&d_concStorage, &d_pitch, numpoints * sizeof(double), maxlength)); 

    //Allocate memory on the GPU for all the arrrays we're going to use in the integrator.

    gpuErrchk(cudaMalloc((void**)&d_temp1, size_temp1));
    gpuErrchk(cudaMalloc((void**)&d_temp2, size_temp1));
    gpuErrchk(cudaMalloc((void**)&d_tempsum, size_tempsum));
    gpuErrchk(cudaMalloc((void**)&d_k1s, size_ks));
    gpuErrchk(cudaMalloc((void**)&d_k2s, size_ks));
    gpuErrchk(cudaMalloc((void**)&d_k3s, size_ks));
    gpuErrchk(cudaMalloc((void**)&d_k4s, size_ks));
    gpuErrchk(cudaMalloc((void**)&d_k5s, size_ks));
    gpuErrchk(cudaMalloc((void**)&d_k6s, size_ks));
    gpuErrchk(cudaMalloc((void**)&d_numpoints, size_numpoints));
    gpuErrchk(cudaMalloc((void**)&d_maxlength, size_maxlength));
    gpuErrchk(cudaMalloc((void**)&d_nc, size_nc));
    gpuErrchk(cudaMalloc((void**)&d_n2, size_n2));
    gpuErrchk(cudaMalloc((void**)&d_ka, size_ka));
    gpuErrchk(cudaMalloc((void**)&d_kb, size_kb));
    gpuErrchk(cudaMalloc((void**)&d_kp, size_kp));
    gpuErrchk(cudaMalloc((void**)&d_km, size_km));
    gpuErrchk(cudaMalloc((void**)&d_kn, size_kn));
    gpuErrchk(cudaMalloc((void**)&d_kn2, size_kn2));
    gpuErrchk(cudaMalloc((void**)&d_concs, size_concs));

    gpuErrchk(cudaMalloc((void**)&d_dt, size_dt));
    gpuErrchk(cudaMalloc((void**)&d_to, size_to));
    gpuErrchk(cudaMalloc((void**)&d_tf, size_tf));

    //Copy all initial values of arrays to GPU.
    gpuErrchk(cudaMemcpy2D(d_concStorage, d_pitch, concStorage, h_pitch, numpoints*sizeof(double), maxlength, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_temp1, &temp1, size_temp1, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_temp2, &temp2, size_temp2, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_tempsum, &tempsum, size_tempsum, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_k1s, &k1s, size_ks, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_k2s, &k2s, size_ks, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_k3s, &k3s, size_ks, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_k4s, &k4s, size_ks, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_k5s, &k5s, size_ks, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_k6s, &k6s, size_ks, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_numpoints, &numpoints, size_numpoints, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_maxlength, &maxlength, size_maxlength, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_nc, &nc, size_nc, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_n2, &n2, size_n2, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_ka, &ka, size_ka, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_kb, &kb, size_kb, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_kp, &kp, size_kp, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_km, &km, size_km, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_kn, &kn, size_kn, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_kn2, &kn2, size_kn2, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_concs, &concs, size_concs, cudaMemcpyHostToDevice));

    gpuErrchk(cudaMemcpy(d_dt, &dt, size_dt, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_to, &to, size_to, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_tf, &tf, size_tf, cudaMemcpyHostToDevice));

    //Run the integrator.
    //gpuErrchk(cudaSetDevice(1));
    rkf5<<<1,1>>>(d_pitch, d_concStorage, d_temp1, d_temp2, d_tempsum, d_k1s, d_k2s, d_k3s, d_k4s, d_k5s, d_k6s, d_numpoints, d_maxlength, d_nc, d_n2, d_ka, d_kb, d_kp, d_km, d_kn, d_kn2, d_concs, d_dt);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
    cudaDeviceSynchronize();

    //Copy 2D array of concentrations vs. time from GPU to Host.
    gpuErrchk( cudaMemcpy2D(concStorage, h_pitch, d_concStorage, d_pitch, numpoints*sizeof(double), maxlength, cudaMemcpyDeviceToHost) );   

    /*
    //Old arrays used to compare known value of e with calculated value of e.
    //Blah.

    double a[10];
    double b[10];
    double c[10];
    for(int i = 0; i< 10; i++)
    {
        a[i]=0;
        b[i]=0;
        c[i]=0;
    }
    */

    //Print out the concStorage array after the kernel runs.  Used to test that the 2D array transferred correctly from host to GPU and back.
    std::cout << "\n\n";
    std::cout << "Calculated Array";
    std::cout << "\n\n";
    for (int i=0; i<maxlength; i++)
    {
        for(int j=0; j<numpoints; j++)
        {
            if (j%(numpoints/10)==0)
            {
                //a[j/(numpoints/10)]=concStorage[i][j];
                std::cout<<concStorage[i][j];
                std::cout<<"   ";
            }
        }
        std::cout << "\n";
    }
    cudaDeviceReset();  //Clean up all memory.
    /*
    ofstream myfile;
    myfile.open ("example.txt");
    myfile << "Writing.";
    myfile.close();
    */

    return 0;
}
//Main kernel.  This is mean to be run as a master thread that calls all the other functions and thusly "runs" the integrator.
__global__ void rkf5(size_t pitch, double* concStorage, double* temp1, double* temp2, double* tempsum, double* k1s, double* k2s, double* k3s, double* k4s, double* k5s, double* k6s, int* numpoints, int* maxlength, int* nc, int* n2, double* ka, double* kb, double* kp, double* km, double* kn, double* kn2, double* concs, double* dt)
{
    /*
    axy variables represent the coefficients in the Butcher's tableau where x represents the order of the step and the y value corresponds to the ky value 
    the coefficient gets multiplied by.  Have to cast them all as doubles, or the ratios evaluate as integers.
    e.g. a21 -> a21 * k1
    e.g. a31 -> a31 * k1 + a32 * k2
    */
    double a21 = static_cast<double>(.25);

    double a31 = static_cast<double>(3)/static_cast<double>(32);
    double a32 = static_cast<double>(9)/static_cast<double>(32);

    double a41 = static_cast<double>(1932)/static_cast<double>(2197);
    double a42 = static_cast<double>(-7200)/static_cast<double>(2197);
    double a43 = static_cast<double>(7296)/static_cast<double>(2197);

    double a51 = static_cast<double>(439)/static_cast<double>(216);
    double a52 = static_cast<double>(-8);
    double a53 = static_cast<double>(3680)/static_cast<double>(513);
    double a54 = static_cast<double>(-845)/static_cast<double>(4104);

    double a61 = static_cast<double>(-8)/static_cast<double>(27);
    double a62 = static_cast<double>(2);
    double a63 = static_cast<double>(-3544)/static_cast<double>(2565);
    double a64 = static_cast<double>(1859)/static_cast<double>(4104);
    double a65 = static_cast<double>(-11)/static_cast<double>(40);

    //for loop that integrates over the specified number of points. Actually, might have to make it a do-while loop for adaptive step sizes 
    //for(int k = 0; k < 1; k++)
    for(int k = 0; k < *numpoints; k++)
    {
        if (k!=0)
        {
            arrCopy<<< 1, *maxlength >>>(concs, tempsum);
            cudaDeviceSynchronize();
        }
        arrInit<<< 1, *maxlength >>>(tempsum, 0);
        cudaDeviceSynchronize();
        arrInit<<< 1, *maxlength >>>(temp1, 0);
        cudaDeviceSynchronize();
        arrInit<<< 1, *maxlength >>>(temp2, 0);
        cudaDeviceSynchronize();

        calcK<<< 1, *maxlength >>>(maxlength, nc, n2, ka, kb, kp, km, kn, kn2, concs, k1s, dt);             //k1 = dt * flux (concs)
        cudaDeviceSynchronize(); //Sync here because kernel continues onto next line before k1 finished

        setup(a21, temp1, tempsum, k1s, concs, maxlength);      //tempsum = a21*k1
        arrAdd<<< 1, *maxlength >>>(concs, tempsum, tempsum);   //tempsum = concs + a21*k1    
        cudaDeviceSynchronize();

        calcK<<< 1, *maxlength >>>(maxlength, nc, n2, ka, kb, kp, km, kn, kn2, tempsum, k2s, dt);           //k2 = dt * flux (concs + a21*k1)
        cudaDeviceSynchronize();

        arrInit<<< 1, *maxlength >>>(tempsum, 0);
        cudaDeviceSynchronize();
        setup(a31, temp1, tempsum, k1s, concs, maxlength);      //temp1sum = a31*k1
        setup(a32, temp1, tempsum, k2s, concs, maxlength);      //tempsum = a31*k1 + a32*k2
        arrAdd<<< 1, *maxlength >>>(concs, tempsum, tempsum);   //tempsum = concs + a31*k1 + a32*k2
        cudaDeviceSynchronize();

        calcK<<< 1, *maxlength >>>(maxlength, nc, n2, ka, kb, kp, km, kn, kn2, tempsum, k3s, dt);           //k3 = dt * flux (concs + a31*k1 + a32*k2)
        cudaDeviceSynchronize();

        arrInit<<< 1, *maxlength >>>(tempsum, 0);
        cudaDeviceSynchronize();
        setup(a41, temp1, tempsum, k1s, concs, maxlength);      //tempsum = a41*k1
        setup(a42, temp1, tempsum, k2s, concs, maxlength);      //tempsum = a41*k1 + a42*k2
        setup(a43, temp1, tempsum, k3s, concs, maxlength);      //tempsum = a41*k1 + a42*k2 + a43*k3
        arrAdd<<< 1, *maxlength >>>(concs, tempsum, tempsum);   //tempsum = concs + a41*k1 + a42*k2 + a43*k3
        cudaDeviceSynchronize();

        calcK<<< 1, *maxlength >>>(maxlength, nc, n2, ka, kb, kp, km, kn, kn2, tempsum, k4s, dt);           //k4 = dt * flux (concs + a41*k1 + a42*k2 + a43*k3)
        cudaDeviceSynchronize();

        arrInit<<< 1, *maxlength >>>(tempsum, 0);
        cudaDeviceSynchronize();
        setup(a51, temp1, tempsum, k1s, concs, maxlength);  //tempsum = a51*k1
        setup(a52, temp1, tempsum, k2s, concs, maxlength);  //tempsum = a51*k1 + a52*k2
        setup(a53, temp1, tempsum, k3s, concs, maxlength);  //tempsum = a51*k1 + a52*k2 + a53*k3
        setup(a54, temp1, tempsum, k4s, concs, maxlength);  //tempsum = a51*k1 + a52*k2 + a53*k3 + a54*k4
        arrAdd<<< 1, *maxlength >>>(concs, tempsum, tempsum);   //tempsum = concs + a51*k1 + a52*k2 + a53*k3 + a54*k4
        cudaDeviceSynchronize();

        calcK<<< 1, *maxlength >>>(maxlength, nc, n2, ka, kb, kp, km, kn, kn2, tempsum, k5s, dt);           //k5 = dt * flux (concs + a51*k1 + a52*k2 + a53*k3 + a54*k4)
        cudaDeviceSynchronize();

        arrInit<<< 1, *maxlength >>>(tempsum, 0);
        cudaDeviceSynchronize();
        setup(a61, temp1, tempsum, k1s, concs, maxlength);  //tempsum = a61*k1
        setup(a62, temp1, tempsum, k2s, concs, maxlength);  //tempsum = a61*k1 + a62*k2
        setup(a63, temp1, tempsum, k3s, concs, maxlength);  //tempsum = a61*k1 + a62*k2 + a63*k3
        setup(a64, temp1, tempsum, k4s, concs, maxlength);  //tempsum = a61*k1 + a62*k2 + a63*k3 + a64*k4
        setup(a65, temp1, tempsum, k5s, concs, maxlength);  //tempsum = a61*k1 + a62*k2 + a63*k3 + a64*k4 + a65*k5
        arrAdd<<< 1, *maxlength >>>(concs, tempsum, tempsum);   //tempsum = concs + a61*k1 + a62*k2 + a63*k3 + a64*k4 + a65*k5
        cudaDeviceSynchronize();

        calcK<<< 1, *maxlength >>>(maxlength, nc, n2, ka, kb, kp, km, kn, kn2, tempsum, k6s, dt);           //k6 = dt * flux (concs + a61*k1 + a62*k2 + a63*k3 + a64*k4 + a65*k5)
        cudaDeviceSynchronize();

        //At this point, temp1 and tempsum are maxlength dimension arrays that are able to be used for other things.

        //Calculate acceptable step size before storing the concentrations.
        calcStepSize(temp1, temp2, tempsum, concs, k1s, k2s, k3s, k4s, k5s, k6s, dt, maxlength);    //temp1 = 4th Order guess, tempsum = 5th Order guess
        cudaDeviceSynchronize();

        //Store the initial conditions in the first column of the storage array.
        if (k==0)
        {
            storeConcs<<< 1, *maxlength >>>(concStorage, pitch, concs, k);  //Store this step's concentrations in 2D array
            cudaDeviceSynchronize();
        }
        //Store future concentration in next column of storage array.
        storeConcs<<< 1, *maxlength >>>(concStorage, pitch, tempsum, k+1);  //Store this step's concentrations in 2D array
        cudaDeviceSynchronize();
    }
}
//calcStepSize will take in an error tolerance, the current concentrations and the k values and calculate the resulting step size according to the following equation
//e[n+1]=y4[n+1] - y5[n+1]
__device__ void calcStepSize(double* temp1, double*temp2, double* tempsum, double* concs, double* k1s, double* k2s, double* k3s, double* k4s, double* k5s, double* k6s, double* dt, int* maxlength)
{
    //do
    //{
        takeFourthOrderStep<<< 1, *maxlength >>>(temp1, concs, k1s, k2s, k3s, k4s, k5s);            //Store 4th order guess in temp1
        takeFifthOrderStep<<< 1, *maxlength >>>(tempsum, concs, k1s, k2s, k3s, k4s, k5s, k6s);  //Store 5th order guess in tempsum
        cudaDeviceSynchronize();
        //arrSub<<< 1, *maxlength >>>(temp1, tempsum, temp2)
        //arrMin<<< 1, *maxlength >>>
    //arrMult
    //}
    //while
}
//takeFourthOrderStep is going to overwrite the old temp1 array with the new array of concentrations that result from a 4th order step.  This kernel is meant to be launched 
// with as many threads as there are discrete concentrations to be tracked.
__global__ void takeFourthOrderStep(double* y4, double* concs, double* k1s, double* k2s, double* k3s, double* k4s, double* k5s)
{
    double b41 = static_cast<double>(25)/static_cast<double>(216);
    double b42 = static_cast<double>(0);
    double b43 = static_cast<double>(1408)/static_cast<double>(2565);
    double b44 = static_cast<double>(2197)/static_cast<double>(4104);
    double b45 = static_cast<double>(-1)/static_cast<double>(5);
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    y4[idx] = concs[idx] + b41 * k1s[idx] + b42 * k2s[idx] + b43 * k3s[idx] + b44 * k4s[idx] + b45 * k5s[idx];
}
//takeFifthOrderStep is going to overwrite the old array of concentrations with the new array of concentrations.  As of now, this will be the 5th order step.  Another function can be d
//defined that will take a fourth order step if that is interesting for any reason.  This kernel is meant to be launched with as many threads as there are discrete concentrations
//to be tracked.
//Store b values in register? Constants?
__global__ void takeFifthOrderStep(double* y5, double* concs, double* k1s, double* k2s, double* k3s, double* k4s, double* k5s, double* k6s)
{
    double b51 = static_cast<double>(16)/static_cast<double>(135);
    double b52 = static_cast<double>(0);
    double b53 = static_cast<double>(6656)/static_cast<double>(12825);
    double b54 = static_cast<double>(28561)/static_cast<double>(56430);
    double b55 = static_cast<double>(-9)/static_cast<double>(50);
    double b56 = static_cast<double>(2)/static_cast<double>(55);
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    y5[idx] = concs[idx] + b51 * k1s[idx] + b52 * k2s[idx] + b53 * k3s[idx] + b54 * k4s[idx] + b55 * k5s[idx] + b56 * k6s[idx];
}
//storeConcs takes the current array of concentrations and stores it in the cId'th column of the 2D concStorage array
//pitch = memory size of a row
__global__ void storeConcs(double* cS, size_t pitch, double* concs, int cId)
{
    int tIdx = threadIdx.x;
    //cS is basically the memory address of the first element of the flattened (1D) 2D array.
    double* row = (double*)((char*)cS + tIdx * pitch);
    row[cId] = concs[tIdx];
}
//Perhaps I can optimize by using shared memory to hold conc values.
__global__ void calcK(int* maxlength, int* nc, int* n2, double* ka, double* kb, double* kp, double* km, double* kn, double* kn2, double* concs, double* ks, double* dt)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    ks[idx]=(*dt)*knowles_flux(idx, maxlength, nc, n2, ka, kb, kp, km, kn, kn2, concs);
}
//Adds two arrays (a + b) element by element and stores the result in array c.
__global__ void arrAdd(double* a, double* b, double* c)
{                                                 
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    c[idx]=a[idx]+b[idx];
}
//Subtracts two arrays (a - b) element by element and stores the result in array c.
__global__ void arrSub(double* a, double* b, double* c)
{                                                 
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    c[idx]=a[idx]-b[idx];
}
//Multiplies two arrays (a * b) element by element and stores the result in array c.
__global__ void arrMult(double* a, double* b, double* c)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    c[idx]=a[idx]*b[idx];
}
//Will find the min of errors array.
__global__ void arrMin(double* errors)
{
    //extern _shared_ double[7];
}
//Initializes an array a to double value b.
__global__ void arrInit(double* a, double b)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    a[idx]=b;
}
//Copies array b onto array a.
__global__ void arrCopy(double* a, double* b)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    a[idx]=b[idx];
}
//Placeholder function for the flux calculation.  It will take the size of the oligomer and current concentrations as inputs.
__device__ double flux(int r, double *concs) 
{
    return -concs[r];
}
//This function multiplies a tableau value by the corresponding k array and adds the result to tempsum.  Used to
//add all the a*k terms. concs not necessary
//e.g. setup(a21, temp1, tempsum, k1s, concs, maxlength) => tempsum = a21 * k1
__device__ void setup(double tableauValue, double *temp1, double *tempsum, double *ks, double *concs, int *maxlength) 
{
    //Sets tempsum to tabVal * k
    arrInit<<< 1, *maxlength >>>(temp1, tableauValue);      //Set [temp1] to tableau value, temp1 = a
    cudaDeviceSynchronize();
    arrMult<<< 1, *maxlength >>>(ks, temp1, temp1);         //Multiply tableau value by appropriate [k], temp1 = a*k
    cudaDeviceSynchronize();
    arrAdd<<< 1, *maxlength >>>(tempsum, temp1, tempsum);   //Move tabVal*k to [tempsum], tempsum = tempsum+temp1
    cudaDeviceSynchronize();
    //temp1 = tableauValue * kArray
    //tempsum = current sum (tableauValue * kArray)
}

//I need to use constants and replace these for loops with dynamic reductions.
__device__ double knowles_flux(int r, int* maxlength, int* nc, int* n2, double* ka, double* kb, double* kp, double* km, double* kn, double* kn2, double *conc)
{
    double frag_term = 0;
    double flux = 0;
    if (r == ((*maxlength)-1))
        {
        flux = -(*km)*(r)*conc[r]+2*(*ka)*conc[r-1]*conc[0];
        }
    else if (r > ((*nc)-1))
        {
        for (int s = r+1; s < (*maxlength); s++)
            {
            frag_term += conc[s];
            }
        flux = -(*km)*(r)*conc[r] + 2*(*km)*frag_term - 2*(*ka)*conc[r]*conc[0] + 2*(*ka)*conc[r-1]*conc[0];
        }
    else if (r == ((*nc)-1))
        {
        for (int s = r+1; s < (*maxlength); s++)
            {
            frag_term += conc[s];
            }
        flux = (*kn)*pow(conc[0],(*nc)) + 2*(*km)*frag_term - 2*(*ka)*conc[r]*conc[0];
        }
    else if (r < ((*nc)-1))
        {
        flux = 0;
        }
    return flux;
}

Answer 1

好的，我终于能够解决这个问题了。所以，如果有人遇到类似的错误，希望这会帮助你。

注意，在我的代码中，我正在使用二维数组，concStorage [maxlengths] [numpoints]。根据Robert Crovella的提示，问题出在我的CPU代码上，与堆栈变量有关。当然，我不知道堆栈变量是什么，但是堆栈的可用内存有限（~1MB），当你在函数中定义数组时，它们会占用那个存储空间。这是一对帮助我的链接。

这个让我意识到这是导致我的堆栈溢出的数组。 http://www.gamedev.net/topic/296695-stack-overflow-in-my-code-but-where/

这个帮助我解决了这个问题。 array with constant size (global vs stack)

解决方案：在我的#includes外部main（）下，我#defined

#define MAXLENGTH 2000
#define NUMPOINTS 1000
double concStorage[MAXLENGTH][NUMPOINTS];   //Stores concs [rows] vs. time [columns]

这让我可以在没有崩溃的情况下运行。我的理解是这个变量现在在堆而不是堆栈上。现在我想知道的是我应该像使用concStorage一样将所有数组移出main（），但就此问题而言，它现在似乎已经解决了。

CUDA应用程序.exe已停止工作

1 个答案: