C++ 与 C# 的 64 位整数按位运算 - 性能

时间:2015-11-12 01:28:06

标签: c# c++ performance comparison bitwise-operators

我将一个二维位域存储在由 5 个无符号 64 位整数组成的数组中,并希望获得尽可能好的性能。我使用 C# 进行开发,但为了建立一个基准,我也用 C++ 实现了同一个类。

问题在于:同样的工作,C# 实现大约需要 10 秒才能完成,而 C++ 实现只需要大约 1 秒,也就是说 C++ 快了 10 倍。C++ 代码使用 VS2015 以 x64 方式生成;C# 代码使用 VS2015、.NET 4.6,同样以 x64 方式生成。当然,两者都是 Release 版本。

编辑:优化C#代码后,它仍然需要7到8秒,而C ++ 1.3秒。

注意: x86中的C ++大约需要6秒钟才能完成。我在64位机器上运行代码。

问题:是什么让C ++更快?有没有办法优化C#代码至少同样快? (也许是一些不安全的魔法?)

让我感到困惑的是,我们正在谈论迭代数组和按位运算。不应该像C ++那样JIT到几乎相同的东西吗?

示例代码: 实现中有两个简单的函数。Left() 和 Right() 分别将整个位域向左、向右移动 1 位,并在相邻的 64 位整数之间正确地传递进位。

C ++

#include <chrono>
#include <cstdlib>
#include <iostream>
using namespace std;
using namespace std::chrono;

/// Fixed-size bit field stored as five 64-bit words that can be shifted by
/// one bit as a single 320-bit value.
///
/// Word 0 is the least significant word: Left() carries the top bit of
/// word i into the bottom bit of word i + 1, and Right() does the reverse.
class BitField
{
private:
    // Compile-time masks shared by all instances. Declaring them
    // static constexpr (instead of mutable per-object members) keeps them
    // out of the object layout and guarantees they cannot change at run time.
    static constexpr unsigned long long LEFTMOST_BIT = 0x8000000000000000ULL;
    static constexpr unsigned long long RIGHTMOST_BIT = 1ULL;

public:
    /// The five words of the field; index 0 is the least significant word.
    unsigned long long Cells_l[5];

    /// Fills every word with a value returned by rand().
    BitField()
    {
        for (size_t i = 0; i < 5; i++)
        {
            Cells_l[i] = rand(); // Random initialization
        }
    }

    /// Shifts the whole field left by one bit. The top bit of each word is
    /// carried into the bottom bit of the next word; the top bit of the last
    /// word is discarded.
    void Left() noexcept
    {
        unsigned long long carry = 0;
        for (int i = 0; i < 5; i++)
        {
            // Capture the outgoing bit before the word is overwritten.
            const unsigned long long nextCarry = (Cells_l[i] & LEFTMOST_BIT) >> 63;
            Cells_l[i] = (Cells_l[i] << 1) | carry;
            carry = nextCarry;
        }
    }

    /// Shifts the whole field right by one bit. The bottom bit of each word
    /// is carried into the top bit of the previous word; the bottom bit of
    /// the first word is discarded.
    void Right() noexcept
    {
        unsigned long long carry = 0;
        for (int i = 4; i >= 0; i--)
        {
            // Capture the outgoing bit before the word is overwritten.
            const unsigned long long nextCarry = (Cells_l[i] & RIGHTMOST_BIT) << 63;
            Cells_l[i] = (Cells_l[i] >> 1) | carry;
            carry = nextCarry;
        }
    }
};

int main()
{
    BitField bf;

    high_resolution_clock::time_point t1 = high_resolution_clock::now();
    for (int i = 0; i < 100000000; i++)
    {
        bf.Left();
        bf.Left();
        bf.Left();
        bf.Right();
        bf.Right();
        bf.Left();
        bf.Right();
        bf.Right();
    }
    high_resolution_clock::time_point t2 = high_resolution_clock::now();

    auto duration = duration_cast<milliseconds>(t2 - t1).count();

    cout << "Time: " << duration << endl << endl;
    // Print to avoid compiler optimizations
    for (size_t i = 0; i < 5; i++)
    {
        cout << bf.Cells_l[i] << endl;
    }

    return 0;
}

C#

using System;
using System.Diagnostics;

namespace TestCS
{
    /// <summary>
    /// Bit field kept in an array of five 64-bit words that can be shifted
    /// by one bit as a whole. Word 0 is the least significant word.
    /// </summary>
    class BitField
    {
        const ulong LEFTMOST_BIT = 0x8000000000000000;
        const ulong RIGHTMOST_BIT = 1;

        static Random rnd = new Random();

        ulong[] Cells;

        /// <summary>Allocates the five words and fills each from the shared Random.</summary>
        public BitField()
        {
            Cells = new ulong[5];
            int slot = 0;
            while (slot < 5)
            {
                Cells[slot] = (ulong)rnd.Next(); // Random initialization
                slot++;
            }
        }

        /// <summary>
        /// Shifts the field left by one bit; the top bit of each word moves
        /// into the bottom bit of the next word.
        /// </summary>
        public void Left()
        {
            ulong carryIn = 0;
            int index = 0;
            while (index < 5)
            {
                // Remember the outgoing bit before the word is overwritten.
                ulong carryOut = (Cells[index] & LEFTMOST_BIT) >> 63;
                Cells[index] = (Cells[index] << 1) | carryIn;
                carryIn = carryOut;
                index++;
            }
        }

        /// <summary>
        /// Shifts the field right by one bit; the bottom bit of each word
        /// moves into the top bit of the previous word.
        /// </summary>
        public void Right()
        {
            ulong carryIn = 0;
            int index = 4;
            while (index >= 0)
            {
                // Remember the outgoing bit before the word is overwritten.
                ulong carryOut = (Cells[index] & RIGHTMOST_BIT) << 63;
                Cells[index] = (Cells[index] >> 1) | carryIn;
                carryIn = carryOut;
                index--;
            }
        }
    }

    /// <summary>
    /// Benchmark entry point: times 100,000,000 rounds of eight one-bit
    /// shifts on a BitField and prints the elapsed milliseconds.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            var field = new BitField();
            var timer = new Stopwatch();

            // Warm-up calls so that compilation time is excluded from the measurement
            field.Left();
            field.Right();

            timer.Start();
            for (int round = 0; round < 100000000; round++)
            {
                field.Left();
                field.Left();
                field.Left();
                field.Right();
                field.Right();
                field.Left();
                field.Right();
                field.Right();
            }
            timer.Stop();

            Console.WriteLine($"Done in: {timer.Elapsed.TotalMilliseconds.ToString()}ms");
        }
    }
}

编辑:已修复示例代码中 "nextCarry" 的拼写错误。

2 个答案:

答案 0 :(得分:1)

部分差异可能是由于两个版本之间的代码差异——您在 C++ 的 Left 和 C# 的 Right 中都没有给 nextCarry 赋值,不过这些可能只是示例中的笔误。

您需要查看两者的反汇编才能看出差异,但主要原因是 C++ 编译器有更多的时间来优化代码。在这种情况下,它展开了循环,内联了所有函数调用(包括构造函数),并把 Cells_l 中的所有内容都放进了寄存器。因此最终得到的是一个只使用寄存器、完全不访问内存的大循环。

我没有看过 C# 编译后的输出,但我怀疑它并没有做到接近这种程度的优化。

另外,正如评论中所提到的,将C#代码中的所有Cells.Length调用替换为5(就像在C ++代码中一样)。

答案 1 :(得分:1)

我从@AntoninLejsek的评论和删除的答案中得到了足够的信息,我可以自己回答。

TL;DR:C++ 编译器对循环做了完整的优化,而 C# 中访问托管数组的成本更高。不过,仅靠不安全(unsafe)代码和固定(fixed)访问还不足以达到 C++ 的性能。

似乎我们需要手动优化C#代码以获得与C ++相当的性能。

  1. 展开循环
  2. 使用不安全的代码进行固定数组访问
  3. 不要反复访问数组 - 而是将项目存储到本地变量中。
以下 C# 代码的运行速度与 C++ 代码一样快(事实上还快了约 100 毫秒)。它在 VS 2015、.NET 4.6、Release x64 下编译。

    /// <summary>
    /// Bit field kept in an inline fixed-size buffer of five 64-bit words.
    /// Word 0 is the least significant word.
    /// </summary>
    unsafe struct BitField
    {
        static Random rnd = new Random();
        public fixed ulong Cells[5];

        /// <summary>Fills the five words from the shared Random.</summary>
        /// <param name="nothing">Not read by the constructor body.</param>
        public BitField(int nothing)
        {
            fixed (ulong* p = Cells)
            {
                for (int i = 0; i < 5; i++)
                {
                    p[i] = (ulong)rnd.Next(); // Just some random number
                }
            }
        }

        /// <summary>
        /// Runs the benchmark sequence (left, left, left, right, right, left,
        /// right, right) on local copies of the five words and writes the
        /// result back to <c>Cells</c> when the loop is finished.
        /// </summary>
        /// <param name="iterations">
        /// Number of times the eight-shift sequence is applied. Defaults to
        /// 100000000.
        /// </param>
        public void StuffUnrolledNonManaged(int iterations = 100000000)
        {
            ulong u0;
            ulong u1;
            ulong u2;
            ulong u3;
            ulong u4;

            // Copy the words into locals once so the loop does not touch the buffer.
            fixed (ulong* p = Cells)
            {
                u0 = p[0];
                u1 = p[1];
                u2 = p[2];
                u3 = p[3];
                u4 = p[4];
            }

            for (int i = 0; i < iterations; i++)
            {
                // Each shift updates the words in an order that reads a
                // neighbour before that neighbour is overwritten, so no
                // separate carry temporaries are needed:
                //   left  - u4 first, taking the top bit of the old u3, ...
                //   right - u0 first, taking the bottom bit of the old u1, ...

                //left
                u4 = (u4 << 1) | (u3 >> 63);
                u3 = (u3 << 1) | (u2 >> 63);
                u2 = (u2 << 1) | (u1 >> 63);
                u1 = (u1 << 1) | (u0 >> 63);
                u0 = u0 << 1;

                //left
                u4 = (u4 << 1) | (u3 >> 63);
                u3 = (u3 << 1) | (u2 >> 63);
                u2 = (u2 << 1) | (u1 >> 63);
                u1 = (u1 << 1) | (u0 >> 63);
                u0 = u0 << 1;

                //left
                u4 = (u4 << 1) | (u3 >> 63);
                u3 = (u3 << 1) | (u2 >> 63);
                u2 = (u2 << 1) | (u1 >> 63);
                u1 = (u1 << 1) | (u0 >> 63);
                u0 = u0 << 1;

                //right
                u0 = (u0 >> 1) | (u1 << 63);
                u1 = (u1 >> 1) | (u2 << 63);
                u2 = (u2 >> 1) | (u3 << 63);
                u3 = (u3 >> 1) | (u4 << 63);
                u4 = u4 >> 1;

                //right
                u0 = (u0 >> 1) | (u1 << 63);
                u1 = (u1 >> 1) | (u2 << 63);
                u2 = (u2 >> 1) | (u3 << 63);
                u3 = (u3 >> 1) | (u4 << 63);
                u4 = u4 >> 1;

                //left
                u4 = (u4 << 1) | (u3 >> 63);
                u3 = (u3 << 1) | (u2 >> 63);
                u2 = (u2 << 1) | (u1 >> 63);
                u1 = (u1 << 1) | (u0 >> 63);
                u0 = u0 << 1;

                //right
                u0 = (u0 >> 1) | (u1 << 63);
                u1 = (u1 >> 1) | (u2 << 63);
                u2 = (u2 >> 1) | (u3 << 63);
                u3 = (u3 >> 1) | (u4 << 63);
                u4 = u4 >> 1;

                //right
                u0 = (u0 >> 1) | (u1 << 63);
                u1 = (u1 >> 1) | (u2 << 63);
                u2 = (u2 >> 1) | (u3 << 63);
                u3 = (u3 >> 1) | (u4 << 63);
                u4 = u4 >> 1;
            }

            // Write the final values back to the buffer.
            fixed (ulong* p = Cells)
            {
                p[0] = u0;
                p[1] = u1;
                p[2] = u2;
                p[3] = u3;
                p[4] = u4;
            }
        }
    }
    

    测试代码

    /// <summary>
    /// Times one call to StuffUnrolledNonManaged after a warm-up call and
    /// prints the elapsed milliseconds.
    /// </summary>
    static void Main(string[] args)
    {
        var field = new BitField(0);
        var timer = new Stopwatch();

        // Warm-up call so that compilation time is excluded from the measurement
        field.StuffUnrolledNonManaged();

        timer.Start();
        field.StuffUnrolledNonManaged();
        timer.Stop();

        Console.WriteLine($"Non managed access unrolled in: {timer.Elapsed.TotalMilliseconds.ToString()}ms");
    }
    

    此代码的运行耗时为 1.1 秒。

    注意:仅使用固定(fixed)数组访问不足以达到 C++ 的性能。如果我们不使用局部变量——即把 u0 的每个实例都替换为 p[0],依此类推——耗时大约是 3.6 秒。

    如果我们只对问题中的代码使用固定访问(在循环中调用 Left() 和 Right() 函数),耗时大约是 5.8 秒。