编辑

Question

下面这种写法是否更快：

return m_bVisible && (_colorTransform.alphaTransform > 0 || _colorTransform.colorOffsetTransform.a > 0) && (!_masked || _maskVisitable);

相比下面这种写法：

if (UNLIKELY(_colorTransform.alphaTransform == 0 && _colorTransform.colorOffsetTransform.a == 0))
{
    return false;
}

if (UNLIKELY(_masked && !_maskVisitable))
{
    return false;
}

return m_bVisible;

我已经做了很多细小的优化，极大地提高了我们游戏的帧率，而这一处是我拿不准的。哪怕只有0.01％的性能提升我也愿意接受，因为在做了大约100处这类优化之后，我已经显著提高了性能（30-40％）。我想问的是：用UNLIKELY优化替代复合布尔表达式是否更好。在此特定表达式中，短路求值并非易事。

Answer 1

我想说第一个更快！

即使您对第二种写法做了分支预测“优化”（这基本上只意味着return m_bVisible;在汇编代码中出现在return false之前），您仍然需要在if语句中执行多达四次比较。

但是，正如您所说，由于这些条件极不可能成立（UNLIKELY），其实没有必要在查看m_bVisible之前先对它们求值。第一个示例首先检查m_bVisible，而它似乎对FALSE或TRUE都没有偏向。

我的直觉是，对于m_bVisible == false的所有情况，第一个会更快。在所有其他情况下，除了第二个示例中的JMP的最小开销（要求您跳过return m_bVisible;才能到达实际的return false）之外，没有什么太大的区别。

编辑

让我们更深入地看看两个变体的x86_64汇编代码（gcc 8）：

测试代码：

#include <stdio.h>

// Global inputs for the two variants below. They take the same positions in
// main()'s expression as m_bVisible, the two alpha values, _masked and
// _maskVisitable take in the question's expression.
bool a=true; int b=0; int c=0; bool d=false; bool e=true;
// Branch-prediction hints built on GCC's __builtin_expect: the second argument
// is the value the expression is expected to have. Only unlikely() is used in
// this snippet.
#define likely(x)       __builtin_expect((x),1)
#define unlikely(x)     __builtin_expect((x),0)

// Variant 1: a single short-circuiting compound boolean.
// `a` is evaluated first, so when it is false none of b, c, d, e is evaluated.
// The bool result is returned as int (0 or 1).
int main()
{
// This label is never the target of a goto in this snippet.
marker1:
    return a && (b > 0 || c > 0) && (!d || e);
}


// Variant 2: two early-return guards marked unlikely(), then `return a`.
// `a` is only read after both guards have been evaluated and found false.
//
// NOTE(review): this is not strictly equivalent to main(). The first guard
// tests `== 0` while main() tests `> 0`, so with a negative b or c (e.g.
// b = -1, c = 0) main() yields 0 but this function falls through and returns a.
int alternative()
{
    // Guard 1: both b and c are zero -> result is false.
    if (unlikely(b == 0 && c == 0))
    {
        return false;
    }

    // Guard 2: d is set but e is not -> result is false.
    if (unlikely(d && !e))
    {
        return false;
    }

    // Neither guard fired: the result is a, converted to int.
    return a;
}

两个函数的x86_64汇编代码（GCC 8.2）：

main（）

# Compiler output for: return a && (b > 0 || c > 0) && (!d || e);
# .L3 produces the result 0, .L5 produces the result 1, .L6 is the common exit.
main:
    push    rbp                         # frame-pointer prologue
    mov     rbp, rsp
    movzx   eax, BYTE PTR a[rip]        # eax = a
    test    al, al
    je      .L3                         # a is false -> result 0 (earliest exit)
    mov     eax, DWORD PTR b[rip]       # eax = b
    test    eax, eax
    jg      .L4                         # b > 0 -> skip the check on c
    mov     eax, DWORD PTR c[rip]       # eax = c
    test    eax, eax
    jle     .L3                         # b <= 0 and c <= 0 -> result 0
.L4:
    movzx   eax, BYTE PTR d[rip]        # eax = d
    xor     eax, 1                      # eax = !d
    test    al, al
    jne     .L5                         # !d holds -> result 1, e is not read
    movzx   eax, BYTE PTR e[rip]        # eax = e
    test    al, al
    je      .L3                         # d set and e clear -> result 0
.L5:
    mov     eax, 1                      # expression is true
    jmp     .L6
.L3:
    mov     eax, 0                      # expression is false
.L6:
    movzx   eax, al                     # zero-extend the 0/1 result to the int return value
    pop     rbp                         # epilogue
    ret

alternative（）

# Compiler output for the two unlikely() guards followed by `return a`.
# .L9 starts the second guard, .L11 is the `return a` path, .L10 is the common exit.
# Each condition is materialised as 0/1 with sete/movzx and then tested as a
# 64-bit value (test rax, rax) before branching.
alternative():
    push    rbp                         # frame-pointer prologue
    mov     rbp, rsp
    mov     eax, DWORD PTR b[rip]       # eax = b
    test    eax, eax
    sete    al                          # al = (b == 0)
    movzx   eax, al
    test    rax, rax
    je      .L9                         # b != 0 -> first guard cannot fire
    mov     eax, DWORD PTR c[rip]       # eax = c
    test    eax, eax
    sete    al                          # al = (c == 0)
    movzx   eax, al
    test    rax, rax
    je      .L9                         # c != 0 -> first guard cannot fire
    mov     eax, 0                      # b == 0 && c == 0 -> return false
    jmp     .L10
.L9:
    movzx   eax, BYTE PTR d[rip]        # eax = d
    movzx   eax, al
    test    rax, rax
    je      .L11                        # d is false -> second guard cannot fire
    movzx   eax, BYTE PTR e[rip]        # eax = e
    xor     eax, 1                      # eax = !e
    movzx   eax, al
    test    rax, rax
    je      .L11                        # e is true -> second guard cannot fire
    mov     eax, 0                      # d && !e -> return false
    jmp     .L10
.L11:
    movzx   eax, BYTE PTR a[rip]        # neither guard fired: result is a
    movzx   eax, al                     # zero-extend to the int return value
.L10:
    pop     rbp                         # epilogue
    ret

在第一个变体的main（）中，最快的出口在第五行je .L3处。在第二个变体中，最快的退出发生的时间要晚得多……在.L9中的je .L11处。

EDIT2

为了完整起见，这里是-O3下的汇编代码

je      .L11

和

# -O3 output for: return a && (b > 0 || c > 0) && (!d || e);
# The globals are compared directly in memory and the result is kept in eax.
main:
        xor     eax, eax                # result defaults to 0
        cmp     BYTE PTR a[rip], 0
        je      .L1                     # a is false -> return 0
        cmp     DWORD PTR b[rip], 0
        jle     .L10                    # b <= 0 -> c has to be checked
.L4:
        cmp     BYTE PTR d[rip], 0
        mov     eax, 1                  # tentative result 1; mov leaves the cmp flags intact
        je      .L1                     # d is false -> return 1
        movzx   eax, BYTE PTR e[rip]    # d is true -> the result is e
.L1:
        ret
.L10:
        cmp     DWORD PTR c[rip], 0
        jg      .L4                     # c > 0 -> continue with the d/e check
        ret                             # b <= 0 and c <= 0 -> return 0 (eax still 0 from the xor)

即使有内联，第一个在-O3下似乎仍然更快！

复合布尔值与不太可能/可能的性能

1 个答案:

编辑

EDIT2