CUDA执行时间

时间:2012-09-19 22:19:58

标签: c++ cuda

我知道执行时间有点麻烦,我都准备尝试使用事件,time.h库和cutTimer,问题是它只打印到身份矩阵,其余的程序是大矩阵,内核调用和结果向量不打印。如果我删除了t1 = clock()t2 = clock(),它的打印效果非常好,我不知道我做错了什么,或者编码错了。

# include <iostream>
# include <conio.h>
# include <time.h>

using namespace std;

# define N 7

__global__ void mult(int *MAT, int *VEC, int *SOL, int b) {
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = 32 * bx + tx;
for (int j = 0; j < b; j++) {
    SOL[i] = ((MAT[i * b + j] * VEC[j]) + SOL[i]) % 2;
}
}

int main () {

int i, j, k, a, s, b;
int Q1[100][100], Q[100][100], Qg[100][100], MI[100][100];
int MAT[N][N], VEC[N], SOL[N];
int *MAT_dev, *VEC_dev, *SOL_dev;
int coef, aux[N], element;

clock_t t1, t2;

cout << "Size of the matrix: ";
cin >> k;
cout << endl << endl;

cont = 0;
a = k + 2;
b = (k * 2) + 1;
size_t nBytes = b * b * sizeof(int);

cudaMalloc((void**)&MAT_dev, nBytes);
cudaMalloc((void**)&VEC_dev, nBytes);
cudaMalloc((void**)&SOL_dev, nBytes);


//-----------------------------------------
//----- MATRIX 
//-----------------------------------------
// Matrix Q1.
for (i = 0; i < a; i++) {
    for (j = 0; j < a; j++) {
        Q1[i][j] = 0;
    }
}
//Matrix Q1 XOR.
Q1[0][1] = 1;
for (i = 0; i < k; i ++) {
    for (j = 0; j < k; j++) {
        Q1[i + 1][j + 1] = Q1[i][j] ^ Q1[i][j + 2];
    }
}

// Q1 to Q
for (i = 0; i < k; i++) {
    for (j = 0; j < k; j++) {
        Q[i][j] = Q1[i][j + 1];
    }
}

// Matrix Inverse Q.
for (i = 0; i < k; i++) 
    for (j = k; j < 2*k; j++) {
        if (i == (j-k))
            Q[i][j] = 1;
        else
            Q[i][j] = 0;
    }
    // Iterations
    for (s = 0; s < k; s++) {
        element = Q[s][s];
        for (j = 0; j < 2*k; j++)
            Q[s][j] = Q[s][j] / element;
        for (i = 0; i < k; i++) {
            if (i == s)
                ;
            else
            {
                coef = Q[i][s];
                for (j = 0; j < 2*k; j++)
                    aux[j] = Q[s][j] * (coef*-1);
                for (j = 0; j < 2*k; j++)
                    Q[i][j] = abs(Q[i][j] + aux[j]) % 2;
            }
        }
    }


//Print Matrix Q Inverse.
cout << endl << endl;
cout << "Inverse of Q.\n\n";
for (i = 0; i < k; i++) {
    for (j = k; j < k * 2; j++) {
        cout << Q[i][j] << " ";
    }
    cout << endl;
}
cout << endl << endl;

// Matrix Q Hat
cout << "Q Hat. \n\n";
for (i = 0; i < k; i++) {
    for (j = 0; j < k + 1; j++) {
        Qg[i][j] = Q[i + 1][j + k];
        if (i == (k - 1) || j == k)
            Qg[i][j] = 0;
        if (i == (k - 1) && j == k)
            Qg[i][j] = 1;
        cout << Qg[i][j]<< " ";
    }
    cout << endl;
}
cout << endl << endl;
// Matrix Identity
cout << "Matrix Identity.\n\n";
for (i = 0; i < k + 1; i++) {
    for (j = 0; j < k + 1; j++) {
        if (i == j)
            MI[i][j] = 1;
        else
            MI[i][j] = 0;
        cout << MI[i][j] << " ";
    }
    cout << endl;
}
cout << endl << endl;

//-----------------------------------------
//----- Big Marix 
//-----------------------------------------

// Big Matrix of Ceros
for(i = 0; i < b; i++) {
    for (j = 0; j < b; j++) {
        MAT[i][j] = 0;
    }
}
// Big Matrix - Matrix Inverse
for (i = 0; i < k; i++) {
    for (j = 0; j < k; j++) {
        MAT[i][j] = Q[i][j + k];
    }
}
// Big Matrix - Matrix Hat
for (i = 0; i < k; i++) {
    for (j = 0; j < k + 1; j++) {
        MAT[i][j + k] = Qg[i][j];
    }
}
// Big Matrix - Matrix Identity
for (i = 0; i < b; i++) {
    for (j = 0; j < (k + 1); j++) {
        MAT[i + k][j] = MI[i][j];
    }
}
// Print Big Matrix
cout << "Big Matrix. \n\n";
for (i = 0; i < b; i++) {
    for (j = 0; j < b; j++) {
        cout << MAT[i][j] << " ";
    }
    cout << endl;
}
cout << endl << endl;

//-----------------------------------------
//----- VECTOR 
//-----------------------------------------

// VECTOR.
cout << "Vector: " << endl;
for (i = 0; i < b; i++) {
    cin >> VEC[i];
}
cout << endl << endl;

//-----------------------------------------
//-----  Assign and Invocation
//-----------------------------------------
t1 = clock();
cudaMemcpy(MAT_dev, MAT, nBytes, cudaMemcpyHostToDevice);
cudaMemcpy(VEC_dev, VEC, nBytes, cudaMemcpyHostToDevice);

mult<<< 1, b >>>(MAT_dev, VEC_dev, SOL_dev, b);

cudaMemcpy(SOL, SOL_dev, nBytes, cudaMemcpyDeviceToHost);

for (i = 0; i < b; i++) {
    cout << SOL[i] << " ";
}
cout << endl;

for (i = 0; i < b; i++) {
    VEC[i] = SOL[i];
}

//-----------------------------------------
//----- Free Memory
//-----------------------------------------
cudaFree(MAT_dev);
cudaFree(VEC_dev);
cudaFree(SOL_dev);
t2 = clock();

cout << "Time of Execution: " << t2 - t1;
cout << endl;

system("PAUSE");
return 0;
}

感谢您的帮助。

2 个答案:

答案 0 :(得分:1)

count 变量在给定代码中未声明。 (实际上这个变量不是必需的) 否则,此代码中没有其他错误。它会正确执行并显示时间。请在计时器中添加cudaMalloc,并从计时器中排除输出打印。这将给出正确的CUDA运行时间。

答案 1 :(得分:0)

您可以尝试使用cuda-memcheck来运行您的程序吗?还是cuda-gdb呢?你可能会发现一些导致早期退出的问题。

相关问题