CuFFT时序约束

时间:2014-08-22 09:08:22

标签: c++ c cuda

我是CUDA的新手，正在尝试使用cuFFT库在GPU上执行FFT。问题是：第一次运行编译好的程序时，FFT大约需要500us；但如果紧接着立即再运行一次，则只需要大约175us（我是在cmd中运行.exe的）。只要中间停顿一会儿再运行，计算FFT就又需要500us。对于不同的NX点DFT，测得的时间几乎相同。我测量的是FFT从开始到结束的时间，不包括数据复制的时间。谁能告诉我问题出在哪里？我原本预期NX越大，FFT的执行时间就越长，但实际耗时几乎相同。以下是我的代码。

我使用的是Visual C++ 2010 Express和CUDA v6.0。系统配置：Core i7 3.60GHz，内存16GB，GPU：GeForce GT640（显示和计算使用同一块GPU）。

任何建议和帮助都将不胜感激。

// Raw Data Generation
// TABLE_SIZE: number of samples in one sine period (sin_func advances the
// phase by TWO_PI/TABLE_SIZE per sample).
#define TABLE_SIZE 1000
// TWO_PI: 2*pi built from a 6-significant-digit approximation of pi; the
// expression has type double.
#define TWO_PI (3.14159 * 2)
// CYCLES: number of sine periods generated.
#define CYCLES 20
// NUMBER_OF_SAMPLES: total length of the generated signal, in samples.
#define NUMBER_OF_SAMPLES (TABLE_SIZE*CYCLES)

// FFT Values 
// NX is the transform length handed to cufftPlan1d; BATCH is the number of
// transforms per plan. Only NX*BATCH of the NUMBER_OF_SAMPLES samples are
// consumed by the FFT.
#define NX 2048  // NX-point DFT
#define BATCH 1 

// Sine Generator Function
//
// Fills sample_ptr[0 .. CYCLES*TABLE_SIZE - 1] with a sine wave that has
// TABLE_SIZE samples per period.
//
// sample_ptr - destination buffer with room for at least CYCLES*TABLE_SIZE
//              floats. A null pointer is ignored and nothing is written.
void sin_func(float *sample_ptr)
{
    // TWO_PI is a double expression; narrow it once instead of on every use.
    const float phaseIncrement = (float)(TWO_PI / TABLE_SIZE);
    int i;

    if (sample_ptr == 0)
        return;

    for (i = 0; i < CYCLES*TABLE_SIZE; i++) {
        // The phase is derived from the sample index rather than accumulated
        // in a float: repeated addition lets rounding error grow with every
        // sample. Indexing the buffer directly also replaces the former
        // "ptr + sizeof(float)/4" step, which advanced by exactly one element
        // only when sizeof(float) happened to be 4.
        sample_ptr[i] = sinf((float)i * phaseIncrement);
    }
}


void main() 
{   
const int ARRAY_SIZE = NUMBER_OF_SAMPLES*sizeof(float);
const int FFT_OUT_SIZE = sizeof(cufftComplex)*(NX/2+1)*BATCH;

// Variable Declaration for execution time computation
    LARGE_INTEGER ticksPerSecond;
    LARGE_INTEGER startTick;   // A point in time
    LARGE_INTEGER starttime;   // For converting tick into real time        
    LARGE_INTEGER endTick;   // A point in time
    LARGE_INTEGER endtime;   // For converting tick into real time          

    // get the high resolution counter's accuracy
     QueryPerformanceFrequency(&ticksPerSecond);
//


// Initialization of input data on Host
float h_rawdata[NUMBER_OF_SAMPLES];
float h_checkdata[NUMBER_OF_SAMPLES];

sin_func(&h_rawdata[0]);
// Display values in the resulting array
for (int i =0; i < 12 ; i++) {
    printf("%f", h_rawdata[i]);
    printf(((i % 4) != 3) ? "\t" : "\n");
}


//Initializing output array on Host
cufftComplex h_fftout[FFT_OUT_SIZE];

//Allocate memory on GPU
float *d_rawdata;
float *d_checkdata;
cufftHandle plan; 
cufftComplex *d_fftout;

cudaMalloc((void**)&d_rawdata, ARRAY_SIZE);
cudaMalloc((void**)&d_checkdata, ARRAY_SIZE); // For Testing Only
cudaMalloc((void**)&d_fftout, FFT_OUT_SIZE);

//copying data to device(GPU) memory
cudaMemcpy (d_rawdata, h_rawdata, ARRAY_SIZE, cudaMemcpyHostToDevice);

// ** Doing FFT ** //
    if (cudaGetLastError() != cudaSuccess){
    fprintf(stderr, "Cuda error: Failed to allocate\n"); 
    return; 
} 
if (cufftPlan1d(&plan, NX, CUFFT_R2C, BATCH) != CUFFT_SUCCESS){
    fprintf(stderr, "CUFFT error: Plan creation failed"); 
    return; 
}   

// fft starting
QueryPerformanceCounter(&startTick);  // Time stamp at start of FFT

//if (cufftExecR2C(plan, (cufftReal*)d_rawdata, d_fftout) != CUFFT_SUCCESS){ 
if (cufftExecR2C(plan, d_rawdata, d_fftout) != CUFFT_SUCCESS){ 
    fprintf(stderr, "CUFFT error: ExecC2C Forward failed"); 
    return; 
} 
if (cudaDeviceSynchronize() != cudaSuccess){ 
    fprintf(stderr, "Cuda error: Failed to synchronize\n"); 
return; 
} 
QueryPerformanceCounter(&endTick); // Time stamp at end End of FFT


// ** Doing Inverse FFT ** //
if (cufftPlan1d(&plan, NX, CUFFT_C2R, BATCH) != CUFFT_SUCCESS){
    fprintf(stderr, "CUFFT error: Plan creation failed"); 
    return; 
}   
if (cufftExecC2R(plan, d_fftout, d_checkdata) != CUFFT_SUCCESS){ 
    fprintf(stderr, "CUFFT error: ExecC2C Forward failed"); 
    return; 
} 
if (cudaDeviceSynchronize() != cudaSuccess){ 
    fprintf(stderr, "Cuda error: Failed to synchronize\n"); 
return; 
} 

// Copying Data Back to Host
cudaMemcpy (h_fftout, d_fftout, FFT_OUT_SIZE, cudaMemcpyDeviceToHost);
cudaMemcpy (h_checkdata, d_checkdata, ARRAY_SIZE, cudaMemcpyDeviceToHost);

cufftDestroy(plan); 
cudaFree(d_rawdata);
cudaFree(d_fftout);

printf("\n");
// Displaying the resulting array
for (int i =0; i < 12 ; i++) {
    printf("%f", h_checkdata[i]/NX);
    printf(((i % 4) != 3) ? "\t" : "\n");
}

/// Ticks conversion    

// convert the tick number into the number of seconds
// since the system was started...
starttime.QuadPart = startTick.QuadPart/ticksPerSecond.QuadPart;
endtime.QuadPart = endTick.QuadPart/ticksPerSecond.QuadPart;

//get the number of hours
int starthours = starttime.QuadPart/3600;
int endhours = endtime.QuadPart/3600;

//get the number of minutes
starttime.QuadPart = starttime.QuadPart - (starthours * 3600);
endtime.QuadPart = endtime.QuadPart - (endhours * 3600);

int startminutes = starttime.QuadPart/60;
int endminutes = endtime.QuadPart/60;

//get the number of seconds
int startseconds = starttime.QuadPart - (startminutes * 60);
int endseconds = starttime.QuadPart - (endminutes *60);

double ticks_per_micro= (double)ticksPerSecond.QuadPart/1000000;
//printf ("\n div = %f",ticks_per_micro);

//get the number of Microseconds
double startmicroSecondes = (double)((startTick.QuadPart % ticksPerSecond.QuadPart) / ticks_per_micro);
double endmicroSecondes = (double)((endTick.QuadPart % ticksPerSecond.QuadPart) / ticks_per_micro);

printf ("\n FFT Started %d:%d:%d::%.2f",starthours, startminutes, startseconds, startmicroSecondes);
printf ("\n FFT Ended %d:%d:%d::%.2f \n",endhours, endminutes, endseconds, endmicroSecondes);

printf ("\nFFT computation time for %d point DFT: %.2fus \n", NX, endmicroSecondes - startmicroSecondes);
}

0 个答案:

没有答案