Question

我使用循环展开（LU）实现了三个版本的冒泡排序来研究c ++模板。即，没有循环展开，手动循环展开C宏，并循环展开模板。代码：

// No LU
void bubbleSort(int* data, int n)
{
    for(int i=n-1; i>0; --i) {
        for(int j=0; j<i; ++j)
            if (data[j]>data[j+1]) std::swap(data[j], data[j+1]);
    }
}
// LU with macro
void bubbleSort4(int* data)
{
#define COMP_SWAP(i, j) if(data[i]>data[j]) std::swap(data[i], data[j])
    COMP_SWAP(0, 1); COMP_SWAP(1, 2); COMP_SWAP(2, 3);
    COMP_SWAP(0, 1); COMP_SWAP(1, 2);
    COMP_SWAP(0, 1);
}
// LU with template
template<int i, int j>
inline void IntSwap(int* data) {
    if(data[i]>data[j]) std::swap(data[i], data[j]);
}
template<int i, int j>
inline void IntBubbleSortLoop(int* data) {
    IntSwap<j, j+1>(data);
    IntBubbleSortLoop<j<i-1?i:0, j<i-1?(j+1):0>(data);
}
template<>
inline void IntBubbleSortLoop<0, 0>(int*) { }
template<int n>
inline void IntBubbleSort(int* data) {
    IntBubbleSortLoop<n-1, 0>(data);
    IntBubbleSort<n-1>(data);
}
template<>
inline void IntBubbleSort<1>(int* data) { }

测试代码：

#include <iostream>
#include <utility>  //std::swap
#include <string.h> // memcpy
#include <sys/time.h>

class Timer{
    struct timeval start_t;
    struct timeval end_t;
    public: 
    void start() { gettimeofday(&start_t, NULL); }
    void end() { gettimeofday(&end_t, NULL); }
    void print() {
        std::cout << "Timer: " << 1000.0*(end_t.tv_sec-start_t.tv_sec)+(end_t.tv_usec-start_t.tv_usec)/1000.0 << " ms\n";
    }
};

int main() {
    Timer t1, t2, t3; const int num=100000000;
    int data[4]; int inidata[4]={3,4,2,1};

    t1.start();
    for(int i=0; i<num; ++i) { 
        memcpy(data, inidata, 4); 
        bubbleSort(data, 4); 
    }
    t1.end();
    t1.print();

    t2.start();
    for(int i=0; i<num; ++i) { 
        memcpy(data, inidata, 4); 
        bubbleSort4(data); 
    }
    t2.end();
    t2.print();

    t3.start();
    for(int i=0; i<num; ++i) { 
        memcpy(data, inidata, 4); 
        IntBubbleSort<4>(data);    
    }
    t3.end();
    t3.print();

    return 0;
}

我的平台是OSX，编译器是clang：

g++ -std=c++11 -o loop_unrolling loop_unrolling.cpp

我希望模板版本与宏版本具有相似的性能，这比普通版本快得多。但是，模板版本是最慢的。有谁知道为什么？

Timer: 1847.78 ms
Timer: 685.736 ms
Timer: 5075.86 ms

=================================

with -O1：

Timer: 861.071 ms
Timer: 495.001 ms
Timer: 2793.02 ms

与-O2：

Timer: 247.691 ms
Timer: 258.666 ms
Timer: 254.466 ms

with -O3：

Timer: 242.535 ms
Timer: 233.354 ms
Timer: 251.297 ms

令我困惑的是，无论我是否启用-Ox，模板版本都应该生成循环展开代码。但看起来并非如此。令我困惑的是，它甚至比非LP版本更慢，而不启用-Ox。

Answer 1

正如Jarod42指出你的问题是函数调用。从函数模板生成函数的编译器进程与内联的编译器进程是分开的。因此，无法保证内联，并且您在模板版本中支付函数调用的性能损失。请注意，inline关键字是编译器的提示，它没有义务尊重。

如果您将来遇到类似的问题并且感觉很勇敢，您可以查看代码生成的汇编代码以查看差异。上面的代码并将其拆分为两个编译单元b4.cc

// b4.cc
#include <algorithm>

// LU with macro
void bubbleSort4(int* data)
{
#define COMP_SWAP(i, j) if(data[i]>data[j]) std::swap(data[i], data[j])
    COMP_SWAP(0, 1); COMP_SWAP(1, 2); COMP_SWAP(2, 3);
    COMP_SWAP(0, 1); COMP_SWAP(1, 2);
    COMP_SWAP(0, 1);
}

和b4t.cc

// b4t.cc
#include <algorithm>

// LU with template
template<int i, int j>
inline void IntSwap(int* data) {
    if(data[i]>data[j]) std::swap(data[i], data[j]);
}
template<int i, int j>
inline void IntBubbleSortLoop(int* data) {
    IntSwap<j, j+1>(data);
    IntBubbleSortLoop<j<i-1?i:0, j<i-1?(j+1):0>(data);
}
template<>
inline void IntBubbleSortLoop<0, 0>(int*) { }
template<int n>
inline void IntBubbleSort(int* data) {
    IntBubbleSortLoop<n-1, 0>(data);
    IntBubbleSort<n-1>(data);
}
template<>
inline void IntBubbleSort<1>(int* data) { }

// LU with macro
void bubbleSort4(int* data)
{
    IntBubbleSort<4>(data);
}

这允许你然后g++ -S b4*.cc并比较b4.s和b4t.s.请注意，b4t.s充满了许多功能。而当您打开g++ -O2 -S b4*.cc中的内联时，程序集实际上是完全相同的。

C ++模板不会提高性能

1 个答案: