Question

我正在编写C++代码来查找内存中第一个非0xFF的字节。为了利用bit scan forward（BSF）指令，我编写了一段自己非常满意的内联汇编代码。但是为了“可读性”以及面向未来（即SIMD矢量化），我想给g++优化器一个机会。g++没有进行矢量化，但确实得出了与我的版本几乎相同的非SIMD解决方案。但由于某种原因，它的版本运行速度慢得多，慢了260000倍（即我必须把我的版本多循环260,000倍才能达到相同的执行时间）。我预料到会有一些差异，但没想到会这么大！有人可以指出为什么会这样吗？我只是想弄清楚原因，以免在未来的内联汇编代码中犯同样的错误。

C ++的起点如下，（就计算准确性而言，此代码中存在一个错误，但我已将其简化为此速度测试）：

// Speed-test variant of count3: walks `data` one 64-bit word at a time and
// returns the byte offset of the first word that is not all-ones, clamped to
// nBytes. The bit scan inside the word is commented out below, so the result
// is only accurate to a multiple of 8 bytes.
// NOTE(review): `data+count` is arithmetic on a void pointer (a GNU extension),
// and the do/while loads a full 8-byte word before `count` is ever compared
// against nBytes.
uint64_t count3 (const void *data, uint64_t const &nBytes) {
      uint64_t count = 0;   // byte offset of the word currently being tested
      uint64_t block;       // the 8 bytes loaded from data+count
      do {
         block = *(uint64_t*)(data+count);
         if ( block != (uint64_t)-1 ) {   // (uint64_t)-1 is eight 0xFF bytes
/*       count += __builtin_ctz(~block);   ignore this for speed test*/
            goto done;
          };
        count += sizeof(block);
      } while ( count < nBytes );
done:
      // count advances in steps of 8 and may overshoot nBytes; clamp it.
      return (count>nBytes ? nBytes : count);
}

汇编代码g ++提出的是：

# g++ output for count3(void const*, unsigned long const&).
# rdi holds `data`, rsi holds the address of `nBytes`, rax is `count`.
_Z6count3PKvRKm:
.LFB33:
    .cfi_startproc
    mov rdx, QWORD PTR [rsi]        # rdx = nBytes, loaded through the reference
    xor eax, eax                    # count = 0
    jmp .L19                        # enter the loop at the load/compare
    .p2align 4,,10
    .p2align 3
.L21:
    add rax, 8                      # count += sizeof(block)
    cmp rax, rdx
    jnb .L18                        # leave the loop once count >= nBytes (unsigned)
.L19:
    cmp QWORD PTR [rdi+rax], -1     # compare the 8 bytes at data+count with all-ones
    je  .L21                        # still all 0xFF: keep scanning
.L18:
    cmp rax, rdx
    cmova   rax, rdx                # return min(count, nBytes) (unsigned)
    ret
    .cfi_endproc

我的内联汇编是

# Listing for count2(void const*, unsigned long const&), the inline-asm version.
# rdi holds `data`, rsi holds the address of `nBytes`, rax is `count`.
_Z6count2PKvRKm:
.LFB32:
    .cfi_startproc
    push    rbx                     # rbx is saved here and restored by the pop below
    .cfi_def_cfa_offset 16
    .cfi_offset 3, -16
    mov rbx, QWORD PTR [rsi]        # rbx = nBytes, loaded through the reference

    # count trailing bytes of 0xFF 
    xor     rax, rax                # count = 0
.ctxff_loop_69:          
    mov     r9,  QWORD PTR [rdi+rax]   # r9 = the 8 bytes at data+count
    xor     r9, -1                  # r9 = ~r9; the result is zero only if all bytes were 0xFF
    jnz   .ctxff_final_69           # a non-0xFF byte was found: stop
    add     rax, 8                  # count += 8
    cmp     rax, rbx 
    jl    .ctxff_loop_69            # loop while count < nBytes (signed compare)
.ctxff_final_69:         
    cmp     rax,rbx  
    cmova   rax,rbx                 # return min(count, nBytes) (unsigned compare)
    pop rbx
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc

据我所知，除了将数据字节与0xFF进行比较的方法外，它基本相同。但我不敢相信这会导致计算时间的巨大差异。

可以想象是我的测试方法导致了错误，但我所做的只是在下面这个简单的for循环中更改函数名称和迭代次数：（其中N为1<<20，'a'是一个N字节的数组，其中除最后一个字节外均为0xFF）

测试1

   for (uint64_t i=0; i < ((uint64_t)1<<15); i++) {
      n = count3(a,N);
   }

测试2

   for (uint64_t i=0; i < ((uint64_t)1<<33); i++) {
      n = count2(a,N);
   }

编辑：

以下是我的实际内联汇编代码，包含SSE版的count1()、x86-64版的count2()，然后是普通C版本的count0()和count3()。我掉进了这个兔子洞，原本希望g++能拿到我的count0()后自行优化成我的count1()甚至count2()。但是它什么也没做，完全没有优化 :( 我应该补充一点，我的平台没有AVX2，这就是为什么我希望让g++自动进行矢量化，这样在我升级平台时代码也会自动随之更新。

就内联汇编中的显式寄存器使用而言，如果我没有明确地使用它们，g ++将为nBytes和count重用相同的寄存器。

就加速而言，在XMM和QWORD之间，我发现真正的好处只是“循环展开”效果，我在count2()复制。

// Plain byte-at-a-time reference version: returns the index of the first byte
// in data[0..nBytes) that is not 0xFF, or nBytes when every byte is 0xFF.
//
// The index uses the same unsigned 64-bit type as the length, so the loop
// condition no longer mixes signed and unsigned operands and cannot overflow
// a signed int when nBytes exceeds INT_MAX.
// The return type stays uint32_t for interface compatibility, so results of
// 2^32 or more are truncated; the casts make that narrowing explicit.
uint32_t count0(const uint8_t *data, uint64_t const &nBytes) {

  for (uint64_t i = 0; i < nBytes; i++) {
    if (data[i] != 0xFF) {
      return (uint32_t)i;
    }
  }

  return (uint32_t)nBytes;
}
// Vector variant: compares 16 bytes per iteration against an all-ones vector
// and returns the offset of the first byte that is not 0xFF, clamped to nBytes.
// The template is written in Intel syntax; the operands are pinned to
// rax / rbx / rdx by the "a" / "b" / "d" constraints.
// NOTE(review): the statement has no "memory" clobber and no memory input, so
// the compiler is not told that it reads the bytes behind `data`.
// NOTE(review): the loop branch `jl` is a signed compare while the final clamp
// uses the unsigned `cmova`; the 64-bit count is narrowed to the uint32_t
// return type.
uint32_t count1(const void *data, uint64_t const &nBytes) {
  uint64_t count;
  __asm__("# count trailing bytes of 0xFF \n"
    "   xor     %[count], %[count]  \n"
    " vpcmpeqb  xmm0, xmm0, xmm0  \n" // make array of 0xFF

    ".ctxff_next_block_%=:        \n"
    " vpcmpeqb  xmm1, xmm0, XMMWORD PTR [%[data]+%[count]]  \n"
    " vpmovmskb r9, xmm1         \n"
    " xor     r9, 0xFFFF       \n" // test if all match (bonus negate r9)
    " jnz   .ctxff_tzc_%=        \n" // if !=0, STOP & tzcnt negated r9
    " add     %[count], 16       \n" // else inc
    " cmp     %[count], %[nBytes] \n"
    " jl    .ctxff_next_block_%=  \n" // while count < nBytes, loop
    " jmp   .ctxff_done_%=      \n" // else done + ALL bytes were 0xFF

    ".ctxff_tzc_%=:           \n"
    " tzcnt   r9, r9          \n" // count bytes up to non-0xFF
    " add     %[count], r9    \n"

    ".ctxff_done_%=:          \n" // more than 'nBytes' could be tested,
    " cmp     %[count],%[nBytes]  \n" // find minimum
    " cmova   %[count],%[nBytes]  "
    : [count] "=a" (count)
    : [nBytes] "b" (nBytes), [data] "d" (data)
    : "r9", "xmm0", "xmm1"
  );
  return count;
};

// Scalar variant, unrolled by two: tests two 64-bit words per loop iteration
// and returns the offset of the first byte that is not 0xFF, clamped to nBytes.
// The template is written in Intel syntax; the operands are pinned to
// rax / rbx / rdi by the "a" / "b" / "D" constraints.
// NOTE(review): the statement has no "memory" clobber and no memory input, so
// the compiler is not told that it reads the bytes behind `data`.
// NOTE(review): count is compared against nBytes only after both words of an
// iteration have been loaded, and with the signed `jl`, whereas the final
// clamp uses the unsigned `cmova`.
uint64_t count2 (const void *data, uint64_t const &nBytes) {
    uint64_t count;
  __asm__("# count trailing bytes of 0xFF \n"
    "    xor     %[count], %[count]  \n"

    ".ctxff_loop_%=:          \n"
    "    mov     r9,  QWORD PTR [%[data]+%[count]] \n"
    "    xor     r9, -1          \n" // r9 = ~r9: zero only if all 8 bytes were 0xFF
    "    jnz   .ctxff_final_%=    \n"
    "    add     %[count], 8     \n" 
    "    mov     r9,  QWORD PTR [%[data]+%[count]] \n"  // <--loop-unroll
    "    xor     r9, -1          \n" 
    "    jnz   .ctxff_final_%=    \n"
    "    add     %[count], 8     \n" 
    "    cmp     %[count], %[nBytes] \n"
    "    jl    .ctxff_loop_%=    \n"
    "    jmp   .ctxff_done_%=   \n" // all bytes tested were 0xFF: skip the bit scan

    ".ctxff_final_%=:            \n"
    "    bsf   r9,  r9           \n" // do tz count on r9 (either of first QWORD bits or XMM bytes)
    "    shr     r9,  3          \n" // scale BSF count accordingly (bit index -> byte index)
    "    add     %[count], r9    \n"
    ".ctxff_done_%=:          \n" // more than 'nBytes' bytes could have been tested,
    "    cmp     %[count],%[nBytes]  \n" // find minimum of count and nBytes
    "    cmova   %[count],%[nBytes]  "
    : [count] "=a" (count)
    : [nBytes] "b" (nBytes), [data] "D" (data)
    : "r9"
  );
  return count;
}

// Counts the trailing zero bits of `qword`.
//
// Returns the index (0..63) of the lowest set bit, or 64 when `qword` is zero,
// which is the result the TZCNT instruction defines for a zero source.
//
// Uses the compiler builtin instead of a hand-written `tzcnt` template: the
// old template was only correct when built with -masm=intel (under the default
// AT&T dialect the operands are swapped), and an asm statement is opaque to
// the optimizer. __builtin_ctzll is undefined for 0, hence the explicit guard.
inline static uint32_t tzcount(uint64_t const &qword) {
  if (qword == 0) {
    return 64;
  }
  return (uint32_t)__builtin_ctzll(qword);
}

// Returns the offset of the first byte in data[0..nBytes) that is not 0xFF,
// or nBytes when every byte is 0xFF.
//
// Whole 64-bit words are tested first; a tail shorter than 8 bytes is tested
// byte by byte, so nothing beyond data[nBytes-1] is read (this also covers
// nBytes == 0).
//
// Fixes relative to the previous version:
//  - the trailing-zero *bit* count of ~block is converted to a *byte* count
//    (>> 3) before it is added to the byte offset;
//  - the word is loaded with memcpy instead of a cast-and-dereference, which
//    avoids void-pointer arithmetic and a type-punned, possibly unaligned load;
//  - the word loop only runs while a full 8 bytes remain.
//
// The bit-index to byte-index mapping assumes a little-endian target (x86).
uint64_t count3 (const void *data, uint64_t const &nBytes) {
      const unsigned char *bytes = (const unsigned char *)data;
      uint64_t count = 0;

      // Invariant: count <= nBytes, so the subtraction cannot wrap.
      while ( nBytes - count >= sizeof(uint64_t) ) {
         uint64_t block;
         __builtin_memcpy(&block, bytes + count, sizeof(block));
         if ( block != (uint64_t)-1 ) {
            // ~block is non-zero here, so the builtin is well defined.
            return count + ((uint64_t)__builtin_ctzll(~block) >> 3);
         }
         count += sizeof(block);
      }

      // Fewer than 8 bytes remain: finish one byte at a time.
      while ( count < nBytes && bytes[count] == 0xFF ) {
         count++;
      }
      return count;
}

// Size in bytes of the test buffer.
uint32_t N = 1<<20;

// Benchmark driver: fills an N-byte buffer with 0xFF, calls count2() on it
// 1<<18 times while accumulating the results in n, then prints N and n.
//
// Fixes relative to the previous version: n is a uint64_t and is now printed
// with a matching %llx conversion (it used to be passed to %x), and the unused
// local `j` is gone. The loop itself is unchanged.
int main(int argc, char **argv) {

  unsigned char a[N];
  __builtin_memset(a,0xFF,N);

  uint64_t n = 0;
  for (uint64_t i=0; i < ((uint64_t)1<<18); i++) {
     n += count2(a,N);
  }

  printf("\n\n %x %llx %x\n", N, (unsigned long long)n, 0);
  // Returning n keeps the accumulated result in use; it is narrowed to int.
  return (int)n;
}

Answer 1

回答问题标题

现在您已经发布了完整的代码：main 中对 count2(a,N) 的调用被提升（hoist）到了循环之外。运行时间仍然会随循环次数（例如1<<18）略有增加，但循环中所做的只是一条add指令。编译器把它优化成了更像下面这样的源码：

uint64_t hoisted_count = count2(a,N);
for (uint64_t i=0; i < ((uint64_t)1<<18); i++) {
   n += hoisted_count;   // doesn't optimize to a multiply
}

没有寄存器冲突：%rax保存从count2内联的asm语句的结果。然后它被用作微循环中的源操作数，通过重复添加将其乘以n。

（请参阅Godbolt Compiler Explorer上的asm，并注意有关void*的算术的所有编译器警告：clang拒绝编译代码）：

## the for() loop in main, when using count2()
.L23:
    addq    %rax, %r12
    subq    $1, %rdx
    jne     .L23

%rdx是循环计数器，%r12是保存n的累加器。 IDK为什么gcc没有将它优化为恒定时间乘法。

据推测，速度低260k的版本并没有设法将整个count2提升出来。从gcc的角度来看，内联asm版本要简单得多：asm语句被视为其输入的纯函数，而gcc甚至不知道它触及内存的任何信息。 C版本触及了大量内存，并且证明它可以被提升要复杂得多。

我在godbolt上检查过：在asm语句中使用"memory" clobber确实可以阻止它被提升。您可以通过main中向量代码块之前是否存在分支目标来判断。

但无论如何，运行时间将类似于n + rep_count与n * rep_count 。

asm语句不使用"memory" clobber或任何内存输入来告诉gcc它读取输入指针指向的内存。 可能会发生不正确的优化，例如从一个修改过数组元素的循环中被提升出来。（有关使用虚拟匿名struct内存输入而不是毯子"memory"内存输入的示例，请参阅Clobbers section in the manual。不幸的是，我不认为这可用于内存块没有编译时常量大小。）

我认为-fno-inline会阻止提升，因为该功能未标有__attribute__((const))或稍弱的__attribute__((pure))，表示没有副作用。内联后，优化器可以看到asm语句。

count0没有被优化成任何好的形式，因为gcc和clang无法自动向量化迭代次数在循环开始时未知的循环。也就是说，它们在strlen或memchr之类的东西以及一般的搜索循环上表现很差，即使已经告知它们越过搜索循环提前退出的位置继续访问内存是安全的（例如，使用char buf[static 512]作为函数参数）。

您的asm代码的优化：

就像我在问题下的评论中所说，使用xor reg, 0xFFFF / jnz而不是cmp reg, 0xFFFF / jnz是比较愚蠢的，因为cmp / jcc可以宏融合（macro-fuse）成一个compare-and-branch uop。cmp reg, mem / jne也可以进行宏融合，因此执行load / xor / branch的标量版本每次比较使用了3倍的uop。（当然，Sandybridge只有在不使用索引寻址模式时才能对load进行微融合。而且，SnB每个解码块只能宏融合一对指令，但您可能会让第一个cmp / jcc和循环分支都实现宏融合。）无论如何，在循环里用xor是一个坏主意。最好只在tzcnt之前才做xor，因为在循环中节省uop比代码大小或uop总数更重要。

你的标量循环是9个融合域uops，这是每2个时钟在一次迭代中发出的太多。（SnB是一个4宽的管道，对于微小的环路，它实际上可以维持它。）

问题的第一个版本中代码中的缩进，count += __builtin_ctz与if处于同一级别，这让我觉得你在计算不匹配的块，而不仅仅是找到第一个

不幸的是，我为这个答案的第一个版本编写的asm代码并没有解决与OP更新和更清晰的代码相同的问题。请参阅SSE2 asm的这个答案的旧版本，使用pcmpeqb / paddb计算0xFF字节，使用psadbw计算水平和以避免环绕。

使用SSE2（或AVX）获得加速：

对pcmpeq的结果进行分支比cmp上的分支需要更多的uops。如果我们的搜索数组很大，我们可以使用一个循环来一次测试多个向量，然后在断开循环后确定哪个字节有我们的命中。

此优化也适用于AVX2。

这是我的尝试，使用GNU C inline asm和-masm=intel语法。（内在函数（intrinsics）可能会给出更好的结果，尤其是在内联时，因为编译器理解内在函数，因此可以通过它们进行常量传播之类的优化。另一方面，如果您了解各种权衡（trade-offs）以及您所针对的微体系结构，通常可以用手写asm击败编译器；当您可以安全地做出某些假设、却无法轻易把它们传达给编译器时也是如此。）

#include <stdint.h>
#include <immintrin.h>

// Scans `bitmap` in 32-byte steps (two 16-byte vpcmpeqb compares per
// iteration) for the first byte that is not 0xFF, i.e. the first byte that
// contains a zero bit.
//
// Returns the offset of that byte from the start of `bitmap`, or `len` when
// the loop reaches bitmap+len without finding one.
//
// Requirements stated by the author:
//  - compile with -masm=intel
//  - len must be a multiple of 32  (TODO: cleanup loop)
//  - buf should be 16B-aligned for best performance
//
// NOTE(review): the p < endp test sits at the bottom of the loop, so the first
// 32 bytes are compared before len is looked at; len == 0 is presumably not
// supported.
// NOTE(review): the accompanying text says this was compiled and assembled but
// not tested.
size_t find_first_zero_bit_avx1(const char *bitmap, size_t len) {
    // return size_t not uint64_t.  This same code works in 32bit mode, and in the x32 ABI where pointers are 32bit

    __m128i pattern, vtmp1, vtmp2;
    const char *result_pos;
    int tmpi;

    // The asm advances `bitmap` itself ("+&r" operand), so keep the original start for the final subtraction.
    const char *bitmap_start = bitmap;

    asm (  // modifies the bitmap pointer, but we're inside a wrapper function
      "vpcmpeqw   %[pat], %[pat],%[pat]\n\t"          // all-ones

      ".p2align 4\n\t"   // force 16B loop alignment, for the benefit of CPUs without a loop buffer
      //IACA_START  // See the godbolt link for the macro definition
      ".Lcount_loop%=:\n\t"
//      "  movdqu    %[v1], [ %[p] ]\n\t"
//      "  pcmpeqb   %[v1], %[pat]\n\t"        // for AVX: fold the load into vpcmpeqb, making sure to still use a one-register addressing mode so it can micro-fuse
//      "  movdqu    %[v2], [ %[p] + 16 ]\n\t"
//      "  pcmpeqb   %[v2], %[pat]\n\t"

      "  vpcmpeqb  %[v1], %[pat], [ %[p] ]\n\t"  // Actually use AVX, to get a big speedup over the OP's scalar code on his SnB CPU
      "  vpcmpeqb  %[v2], %[pat], [ %[p] + 16 ]\n\t"

      "  vpand     %[v2], %[v2], %[v1]\n\t"         // combine the two results from this iteration
      "  vpmovmskb  %k[result], %[v2]\n\t"
      "  cmp       %k[result], 0xFFFF\n\t"          // k modifier: eax instead of rax
      "  jne     .Lfound%=\n\t"

      "  add       %[p], 32\n\t"
      "  cmp       %[p], %[endp]\n\t"              // this is only 2 uops after the previous cmp/jcc.  We could re-arrange the loop and put the branches farther apart if needed.  (e.g. start with a vpcmpeqb outside the loop, so each iteration actually sets up for the next)
      "  jb     .Lcount_loop%=\n\t"
      //IACA_END

      // any necessary code for the not-found case, e.g. bitmap = endp
      "  mov     %[result], %[endp]\n\t"
      "  jmp    .Lend%=\n\t"

      ".Lfound%=:\n\t"                       // we have to figure out which vector the first non-match was in, based on v1 and (v2&v1)
                                  // We could just search the bytes over again, but we don't have to.
                                  // we could also check v1 first and branch, instead of checking both and using a branchless check.
      "  xor       %k[result], 0xFFFF\n\t"
      "  tzcnt     %k[result], %k[result]\n\t"  // runs as bsf on older CPUs: same result for non-zero inputs, but different flags.  Faster than bsf on AMD
      "  add       %k[result], 16\n\t"          // result = byte count in case v1 is all-ones.  In that case, v2&v1 = v2

      "  vpmovmskb %k[tmp], %[v1]\n\t"
      "  xor       %k[tmp], 0xFFFF\n\t"
      "  bsf       %k[tmp], %k[tmp]\n\t"        // bsf sets ZF if its *input* was zero.  tzcnt's flag results are based on its output.  For AMD, it would be faster to use more insns (or a branchy strategy) and avoid bsf, but Intel has fast bsf.
      "  cmovnz    %k[result], %k[tmp]\n\t"     // if there was a non-match in v1, use it instead of tzcnt(v2)+16

      "  add       %[result], %[p]\n\t"         // If we needed to force 64bit, we could use %q[p].  But size_t should be 32bit in the x32 ABI, where pointers are 32bit.  This is one advantage to using size_t over uint64_t
      ".Lend%=:\n\t"
      : [result] "=&a" (result_pos),   // force compiler to pick eax/rax to save a couple bytes of code-size from the special cmp eax, imm32  and xor eax,imm32 encodings
        [p] "+&r" (bitmap),
        // throw-away outputs to let the compiler allocate registers.  All early-clobbered so they aren't put in the same reg as an input
        [tmp] "=&r" (tmpi),
        [pat] "=&x" (pattern),
        [v1] "=&x" (vtmp1), [v2] "=&x" (vtmp2)
      : [endp] "r" (bitmap+len)
        // doesn't compile: len isn't a compile-time constant
        // , "m" ( ({ struct { char x[len]; } *dummy = (typeof(dummy))bitmap ; *dummy; }) )  // tell the compiler *which* memory is an input.
      : "memory" // we read from data pointed to by bitmap, but bitmap[0..len] isn't an input, only the pointer.
    );

    // result_pos is an absolute address (or endp when nothing was found); convert it to an offset.
    return result_pos - bitmap_start;
}

这段代码确实可以编译和汇编，生成的asm看起来与我预期的一致，但我没有测试过。请注意，它把所有寄存器分配都留给了编译器，因此更适合内联。即使不内联，它也不会强制使用必须保存/恢复的调用保留（call-preserved）寄存器（例如，使用"b"约束）。

未完成：处理最后一个32B子数据块的标量代码。

基于Agner Fog's guides / tables的Intel SnB系列CPU的静态性能分析。另请参阅x86标记wiki。 我假设我们没有在缓存吞吐量方面遇到瓶颈，所以此分析仅适用于L2缓存中的数据热，或者只有L1缓存足够快。

这个循环有7个融合域uop，因此前端可以每2个时钟发出一次迭代（两个向量）。（前端以4个uop为一组发出。）（如果两个cmp / jcc对在同一个块中解码，它实际上可能是8个uop。Haswell及之后的CPU每个解码组可以进行两次宏融合，但更早的CPU只能宏融合第一对。我们可以对循环进行软件流水线化，使提前退出的分支离p < endp分支更远。）

所有这些融合域uop都包含一个ALU uop，因此瓶颈将出现在ALU执行端口上。 Haswell添加了第4个ALU单元，可以处理简单的非向量操作，包括分支，因此可以每2个时钟（每个时钟16B）以一次迭代运行此循环。你的i5-2550k（在评论中提到）是一个SnB CPU。

我使用IACA来计算每个端口的uops，因为手动执行它是非常耗时的。 IACA很愚蠢，认为除了循环计数器之外还有某种迭代间的依赖，所以我不得不使用-no_interiteration：

g++ -masm=intel -Wall -Wextra -O3 -mtune=haswell find-first-zero-bit.cpp -c -DIACA_MARKS
iaca -64 -arch IVB -no_interiteration find-first-zero-bit.o

Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - find-first-zero-bit.o
Binary Format - 64Bit
Architecture  - SNB
Analysis Type - Throughput

Throughput Analysis Report
--------------------------
Block Throughput: 2.50 Cycles       Throughput Bottleneck: Port1, Port5

Port Binding In Cycles Per Iteration:
-------------------------------------------------------------------------
|  Port  |  0   -  DV  |  1   |  2   -  D   |  3   -  D   |  4   |  5   |
-------------------------------------------------------------------------
| Cycles | 2.0    0.0  | 2.5  | 1.0    1.0  | 1.0    1.0  | 0.0  | 2.5  |
-------------------------------------------------------------------------

N - port number or number of cycles resource conflict caused delay, DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is expected
! - instruction not supported, was not accounted in Analysis

| Num Of |              Ports pressure in cycles               |    |
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |    |
---------------------------------------------------------------------
|   2^   |           | 1.0 | 1.0   1.0 |           |     |     | CP | vpcmpeqb xmm1, xmm0, xmmword ptr [rdx]
|   2^   |           | 0.6 |           | 1.0   1.0 |     | 0.4 | CP | vpcmpeqb xmm2, xmm0, xmmword ptr [rdx+0x10]
|   1    | 0.9       | 0.1 |           |           |     | 0.1 | CP | vpand xmm2, xmm2, xmm1
|   1    | 1.0       |     |           |           |     |     |    | vpmovmskb eax, xmm2
|   1    |           |     |           |           |     | 1.0 | CP | cmp eax, 0xffff
|   0F   |           |     |           |           |     |     |    | jnz 0x18
|   1    | 0.1       | 0.9 |           |           |     |     | CP | add rdx, 0x20
|   1    |           |     |           |           |     | 1.0 | CP | cmp rdx, rsi
|   0F   |           |     |           |           |     |     |    | jb 0xffffffffffffffe1

在SnB上：pcmpeqb可以在p1 / p5上运行。融合比较和分支只能在p5上运行。非融合cmp可以在p015上运行。无论如何，如果其中一个分支没有宏熔合，则循环可以每8/3 = 2.666个循环运行一次。通过宏观融合，最佳情况是7/3 = 2.333个周期。（IACA并没有尝试模拟uops到端口的分布，就像硬件动态地做出这些决定一样。但是，我们不能期望从硬件中完美调度，因此每2.5个周期可能有2个向量两个宏观融合都发生了。使用port0的Uops有时会窃取port1或port5，从而降低吞吐量。）

正如我之前所说，Haswell更好地处理这个循环。 IACA认为HSW可以在每1.75c的一次迭代中运行循环，但这显然是错误的，因为所采用的循环分支结束了问题组。它将以重复的4,3 uop模式发布。但是执行单元可以处理比这个循环的前端更多的吞吐量，所以它应该能够跟上Haswell / Broadwell / Skylake的前端并且每2个时钟运行一次迭代。

进一步展开更多vpcmpeqb / vpand每个向量只有2个uop（或3个没有AVX，我们将其加载到临时，然后将其用作pcmpeqb的目标。）因此，通过充分展开，我们应该能够每个时钟执行2个向量加载。如果没有AVX，在没有PAND技巧的情况下这是不可能的，因为向量加载/比较/ movmsk /测试和分支是4 uops。更大的展开会更多地解码我们找到匹配的最终位置：基于标量cmp的清理循环可能是一个好主意，一旦我们在该区域。您可以使用相同的标量循环来清理非32B大小的数据。

如果使用SSE，使用movdqu / pcmpeqb xmm,xmm，我们可以使用索引寻址模式而不多花uop，因为无论寻址模式如何，movdqu加载始终是单个load uop。（与store不同，它不需要与任何东西微融合。）这样我们就可以用一个指向数组末尾的基指针，再让索引从负值向上计数到零，从而节省循环开销。例如add %[idx], 32 / js，在索引为负数时继续循环。

但是，对于AVX，我们可以通过使用单寄存器寻址模式节省2个uop，这样vpcmpeqb %[v1], %[pat], [ %[p] + 16 ]就可以微融合。这意味着我们需要示例中所用的add / cmp / jcc循环结构。这同样适用于AVX2。

Answer 2

所以我觉得我找到了问题所在。我认为我的内联汇编中使用的某个寄存器，尽管列在了clobber列表中，仍与g++对它的使用发生了冲突，从而破坏了测试循环的迭代。我把g++版本的代码改写成内联汇编代码后，得到了与我自己的版本相同的260000倍加速。此外，事后看来，这种“加速”后的计算时间本身就快得不合常理。

最后，我太专注于写成函数形式的代码，没有注意到g++实际上已经把函数内联（我使用的是-O3优化）进了测试用的for循环。当我强制g++不内联（即-fno-inline）时，260000倍的加速消失了。

我认为g++在未经我许可内联该函数时，没有考虑内联汇编代码的“clobber list”。

经验教训：我需要在内联汇编约束上做得更好，或者使用__attribute__ ((noinline))阻止函数内联。

编辑：可以确定g++正在使用rax作为main()中for循环的计数器，这与我对rax的使用相冲突。

为什么其中一个比另一个快得多？

2 个答案:

回答问题标题

您的asm代码的优化：

使用SSE2（或AVX）获得加速：