每次调用程序时,下面的代码执行2000万次,所以我需要一种方法来使代码尽可能优化。我不是一名高级c ++程序员,因此我寻求这个令人难以置信的社区帮助。
int WilcoxinRST::work(GeneSet originalGeneSet, vector<string> randomGenes) {
vector<string> geneIDs;
vector<bool> isOriginal;
vector<int> rank;
vector<double> value;
vector<int> score;
int genesPerSet = originalGeneSet.geneCount();
unsigned int totalGenes, tempScore;
/**
* Fill the first half of the vectors with original gene set data
*/
totalGenes = genesPerSet * 2;
for (int i = 0; i < genesPerSet; i++) {
geneIDs.push_back(originalGeneSet.getMemberGeneAt(i));
isOriginal.push_back(true);
value.push_back(fabs(expressionLevels.getValue(geneIDs[i], statType)));
}
/**
* Fill the second half with random data
*/
for (unsigned int i = genesPerSet; i < totalGenes; i++) {
geneIDs.push_back(randomGenes.at(i - genesPerSet));
isOriginal.push_back(false);
value.push_back(fabs(expressionLevels.getValue(geneIDs[i], statType)));
}
totalGenes = geneIDs.size();
/**
* calculate the scores
*/
if (statType == Fold_Change || statType == T_Statistic
|| statType == Z_Statistic) {
// Higher value is a winner
for (unsigned int i = 0; i < totalGenes; i++) {
tempScore = 0;
if (!isOriginal[i]) {
for (int j = 0; j < genesPerSet; j++) {
if (value.at(i) > value.at(j)) {
tempScore++;
}
}
} else {
for (unsigned int j = genesPerSet; j < totalGenes; j++) {
if (value.at(i) > value.at(j)) {
tempScore++;
}
}
}
score.push_back(tempScore);
}
} else if (statType == FDR_PValue || statType == PValue) {
// Lower value is a winner
for (unsigned int i = 0; i < totalGenes; i++) {
tempScore = 0;
if (!isOriginal[i]) {
for (int j = 0; j < genesPerSet; j++) {
if (value.at(i) < value.at(j)) {
tempScore++;
}
}
} else {
for (unsigned int j = genesPerSet; j < totalGenes; j++) {
if (value.at(i) < value.at(j)) {
tempScore++;
}
}
}
score.push_back(tempScore);
}
} else {
cout << endl << "ERROR. Statistic type not defined." << endl;
}
/**
* calculate Ua, Ub and U
*/
int U_Original = 0, U_Random = 0, U_Final;
for (int j = 0; j < genesPerSet; j++) {
U_Original += score[j];
}
for (unsigned int j = genesPerSet; j < totalGenes; j++) {
U_Random += score[j];
}
U_Final = (U_Original < U_Random) ? U_Original : U_Random;
/**
* calculate z
*/
double Zn, Zd, Z;
Zn = U_Final - ((genesPerSet * genesPerSet) / 2);
Zd = sqrt(
(double) (((genesPerSet * genesPerSet
* (genesPerSet + genesPerSet + 1)))) / 12.0);
Z = Zn / Zd;
/**
* Return 0/1/2
* 2: p value < 0.01
* 1: 0.01 < p value < 0.05
* 0: p value > 0.05
*/
if (fabs(Z) > 2.303)
return 2;
else if (fabs(Z) > 1.605)
return 1;
else
return 0;
}
答案 0 :(得分:2)
您的代码的复杂度为O(N * N)[genesPerSet
= N]。但是使用值的顺序与您无关的事实我们可以在O(N•log(N))中对其进行排序并计算O(N)中的“得分”。 (可能会快几千倍)。
此外,我们总共有N * N个比较。然后U_Original + U_Random
= N * N,这意味着我们不需要计算U_Random。
你的统计量Zn = Umin-N * N / 2; [Umix=min(U_Original,U_Random)],
当你只有abs(Zn / Zd)在N * N / 2附近对称时。我们只需要一种算法。
1.-首先可以通过(const)引用来获取参数:
int WilcoxinRST::work(const GeneSet &originalGeneSet, const vector<string> &randomGenes)
2.-填写载体geneIDs;但是不要用它?为什么呢?
3.-您只能迭代2次。
4.-您将信号值(探测强度?)保持在一个向量中,并使用另一个向量来表示每个项目的内容 - 简单地保留在两个向量中。
5.-您不需要得分矢量,只需要摘要。
6.-为什么要2000万次?我gess你正在计算一些“统计”稳定性或BStrap。可能你多次使用相同的originalGeneSet。我想你可以在另一个问题中发布调用这个函数的代码,以便每次都能使值向量和排序。
这是第一个新的O(N•log(N))代码。
以下是对代码的清理,但仍然是O(N * N),快速但仅由常数因子决定。
然后是相同的代码,但与您的原始代码和更多评论混合在一起。
请调试这个并告诉我是怎么回事。
#include<vector>
#include<algorithm>
int WilcoxinRST::work(const GeneSet &originalGeneSet , const vector<string>& randomGenes)
{
size_t genesPerSet = originalGeneSet.geneCount();
std::vector<double> valueOri(genesPerSet), valueRnd(genesPerSet);
/**
* Fill the valueOri vector with original gene set data, and valueRnd with random data
*/
for (size_t i = 0; i < genesPerSet; i++)
{
valueOri[i] = std::fabs(expressionLevels.getValue( originalGeneSet.getMemberGeneAt(i) , statType ));
valueRnd[i] = std::fabs(expressionLevels.getValue( randomGenes.at(i) , statType ));
}
std::sort(valueOri.begin(),valueOri.end());
std::sort(valueRnd.begin(),valueRnd.end());
/**
* calculate the scores Ua, Ub and U
*/
long U_Original ;
if (statType == Fold_Change || statType == T_Statistic || statType == Z_Statistic
statType == FDR_PValue || statType == PValue )
{
// Higher value is a winner
size_t j=0;
for (size_t i = 0; i < genesPerSet /*totalGenes*/; i++) // i - 2x
{
while(valueOri[i] > valueRnd[j]) ++j ;
U_Original += j;
}
} else { cout << endl << "ERROR. Statistic type not defined." << endl; }
/**
* calculate z
*/
double Zn, Zd, Z;
Zn = U_Original - ((genesPerSet * genesPerSet) / 2);
Zd = std::sqrt( (double) (((genesPerSet * genesPerSet* (genesPerSet + genesPerSet + 1)))) / 12.0);
Z = Zn / Zd;
/**
* Return 0/1/2
* 2: p value < 0.01
* 1: 0.01 < p value < 0.05
* 0: p value > 0.05
*/
if (std::fabs(Z) > 2.303) return 2;
else if (std::fabs(Z) > 1.605) return 1;
else return 0;
}
以下是对代码的清理,但仍然是O(N * N),快速但仅由常数因子决定。
#include<vector>
using namespace std;
class GeneSet ;
class WilcoxinRST;
int WilcoxinRST::work(const GeneSet &originalGeneSet , const vector<string>& randomGenes)
{
size_t genesPerSet = originalGeneSet.geneCount();
vector<double> valueOri(genesPerSet), valueRnd(genesPerSet);
/**
* Fill the valueOri vector with original gene set data, and valueRnd with random data
*/
for (size_t i = 0; i < genesPerSet; i++)
{
valueOri[i] = fabs(expressionLevels.getValue( originalGeneSet.getMemberGeneAt(i) , statType ));
valueRnd[i] = fabs(expressionLevels.getValue( randomGenes.at(i) , statType ));
}
/**
* calculate the scores Ua, Ub and U
*/
long U_Original = 0, U_Random = 0, U_Final;
if (statType == Fold_Change || statType == T_Statistic || statType == Z_Statistic)
{
// Higher value is a winner
for (size_t i = 0; i < genesPerSet /*totalGenes*/; i++) // i - 2x
{ for (size_t j = 0; j < genesPerSet; j++)
{ U_Random += (valueRnd[i] > valueOri[j]);// i en 2 set=Rnd, j in 1 set=Ori. count how many Ori are less than this Rnd
U_Original+= (valueOri[i] > valueRnd[j]);// i in 1 set=Ori, j in 2 set=Rnd, count how many Rnd are less than this Ori
}
}
} else
if (statType == FDR_PValue || statType == PValue)
{
// Lower value is a winner
for (size_t i = 0; i < genesPerSet; i++)
{
for (size_t j = 0; j < genesPerSet; j++)
{ U_Random += (valueRnd[i] < valueOri[j]);// i en 2 set=Rnd, j in 1 set=Ori. count how many Ori are > than this Rnd
U_Original+= (valueOri[i] < valueRnd[j]);// i in 1 set=Ori, j in 2 set=Rnd, count how many Rnd are > than this Ori
}
}
} else { cout << endl << "ERROR. Statistic type not defined." << endl; }
U_Final = (U_Original < U_Random) ? U_Original : U_Random;
/**
* calculate z
*/
double Zn, Zd, Z;
Zn = U_Final - ((genesPerSet * genesPerSet) / 2);
Zd = sqrt(
(double) (((genesPerSet * genesPerSet
* (genesPerSet + genesPerSet + 1)))) / 12.0);
Z = Zn / Zd;
/**
* Return 0/1/2
* 2: p value < 0.01
* 1: 0.01 < p value < 0.05
* 0: p value > 0.05
*/
if (fabs(Z) > 2.303) return 2;
else if (fabs(Z) > 1.605) return 1;
else return 0;
}
相同的代码,但与您的原始代码和更多评论混合在一起。
int WilcoxinRST::work(const GeneSet &originalGeneSet , const vector<string>& randomGenes)
{
size_t genesPerSet = originalGeneSet.geneCount();
unsigned int totalGenes, tempScore;
totalGenes = genesPerSet * 2;
//vector<string> geneIDs;
//vector<bool> isOriginal;
//vector<int> rank;
vector<double> valueOri(genesPerSet), valueRnd(genesPerSet);
//vector<int> score;
/**
* Fill the first half of the vectors with original gene set data
*/
for (size_t i = 0; i < genesPerSet; i++)
{
//geneIDs.push_back( originalGeneSet.getMemberGeneAt(i) );
//isOriginal.push_back(true);
valueOri[i] = fabs(expressionLevels.getValue( originalGeneSet.getMemberGeneAt(i) , statType ));
valueRnd[i] = fabs(expressionLevels.getValue( randomGenes.at(i) , statType ));
}
/**
* Fill the second half with random data
*/
//for (unsigned int i = genesPerSet; i < totalGenes; i++) {
// geneIDs.push_back(randomGenes.at(i - genesPerSet));
// isOriginal.push_back(false);
// value.push_back(fabs(expressionLevels.getValue(geneIDs[i], statType)));
//}
//totalGenes = geneIDs.size();
/**
* calculate the scores
*/
/**
* calculate Ua, Ub and U
*/
long U_Original = 0, U_Random = 0, U_Final;
//for (int j = 0; j < genesPerSet; j++) // j in 1 set=Ori. count how many Ori are less than this Rnd
//{
// U_Original += score[j];
//}
//for (unsigned int j = genesPerSet; j < totalGenes; j++) // j in 2 set=Rnd, count how many Rnd are less than this Ori
//{
// U_Random += score[j];
//}
if (statType == Fold_Change || statType == T_Statistic || statType == Z_Statistic)
{
// Higher value is a winner
for (size_t i = 0; i < genesPerSet /*totalGenes*/; i++) // i - 2x
{ //tempScore = 0;
//if (!isOriginal[i]) // i en 2 set=Rnd, j in 1 set=Ori. count how many Ori are less than this Rnd
for (size_t j = 0; j < genesPerSet; j++)
{ U_Random += (valueRnd[i] > valueOri[j]);// i en 2 set=Rnd, j in 1 set=Ori. count how many Ori are less than this Rnd
U_Original+= (valueOri[i] > valueRnd[j]);// i in 1 set=Ori, j in 2 set=Rnd, count how many Rnd are less than this Ori
}
//} else
//{
// for (unsigned int j = genesPerSet; j < totalGenes; j++) // i in 1 set=Ori, j in 2 set=Rnd, count how many Rnd are less than this Ori
// { if (value.at(i) > value.at(j)) { tempScore++; }
// }
//}
//score.push_back(tempScore);
}
} else
if (statType == FDR_PValue || statType == PValue)
{
// Lower value is a winner
for (size_t i = 0; i < genesPerSet; i++)
{
for (size_t j = 0; j < genesPerSet; j++)
{ U_Random += (valueRnd[i] < valueOri[j]);// i en 2 set=Rnd, j in 1 set=Ori. count how many Ori are > than this Rnd
U_Original+= (valueOri[i] < valueRnd[j]);// i in 1 set=Ori, j in 2 set=Rnd, count how many Rnd are > than this Ori
}
//} else
//{
// for (unsigned int j = genesPerSet; j < totalGenes; j++) // i in 1 set=Ori, j in 2 set=Rnd, count how many Rnd are less than this Ori
// { if (value.at(i) > value.at(j)) { tempScore++; }
// }
//}
//score.push_back(tempScore);
}
//for (unsigned int i = 0; i < totalGenes; i++)
//{ tempScore = 0;
// if (!isOriginal[i])
// { for (int j = 0; j < genesPerSet; j++) {
// if (value.at(i) < value.at(j)) { // Rnd i < Ori j increm U_Random
// tempScore++;
// }
// }
// } else {
// for (unsigned int j = genesPerSet; j < totalGenes; j++) { // Ori i < Rnd j. Increm U_Original
// if (value.at(i) < value.at(j)) {
// tempScore++;
// }
// }
// }
// score.push_back(tempScore);
//}
} else { cout << endl << "ERROR. Statistic type not defined." << endl; }
U_Final = (U_Original < U_Random) ? U_Original : U_Random;
/**
* calculate z
*/
double Zn, Zd, Z;
Zn = U_Final - ((genesPerSet * genesPerSet) / 2);
Zd = sqrt(
(double) (((genesPerSet * genesPerSet
* (genesPerSet + genesPerSet + 1)))) / 12.0);
Z = Zn / Zd;
/**
* Return 0/1/2
* 2: p value < 0.01
* 1: 0.01 < p value < 0.05
* 0: p value > 0.05
*/
if (fabs(Z) > 2.303)
return 2;
else if (fabs(Z) > 1.605)
return 1;
else
return 0;
}