我的代码在下面这个循环上花费了相当多的时间。我使用 gcc-4.8,并以 -O3 -march=native 选项
来编译代码。由于我在优化方面完全是新手,我该如何判断编译器是否已经做了它所能做的全部优化?我使用的处理器是 AMD FX(tm)-6200。
float* __restrict__ ApsiPtr = Apsi.begin();
const float* const __restrict__ psiPtr = psi.begin();
const float* const __restrict__ diagPtr = diag().begin();
register const label nCells = diag().size();
for (register label cell=0; cell<nCells; cell++) {
ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell];
}
用 ddd 转储出的汇编代码如下。
Dump of assembler code from 0x7ffff1d32010 to 0x7ffff1d32110:¬
=> mov (%rsp),%rdi¬
callq 0x7ffff1ba5ca0 <_ZNK4Foam9lduMatrix4diagEv@plt>¬
mov 0x8(%rax),%edx¬
test %edx,%edx¬
mov %edx,%esi¬
mov %edx,0x34(%rsp)¬
jle 0x7ffff1d3251f ¬
mov 0x38(%rsp),%r10¬
lea 0x10(%rbx),%r8¬
lea 0x10(%rbp),%rax¬
lea 0x10(%r10),%rdi¬
cmp %rdi,%rbx¬
setae %r9b¬
cmp %r8,%r10¬
setae %r11b¬
or %r11d,%r9d¬
cmp %rax,%rbx¬
setae %dl¬
cmp %r8,%rbp¬
setae %cl¬
or %ecx,%edx¬
test %dl,%r9b¬
je 0x7ffff1d32728 ¬
cmp $0xc,%esi¬
jbe 0x7ffff1d32728 ¬
shr %esi¬
lea 0x40(%r10),%rax¬
mov %rbx,%rdx¬
lea -0x5(%rsi),%r8d¬
lea (%rsi,%rsi,1),%r9d¬
mov %esi,0x38(%rsp)¬
shr $0x2,%r8d¬
mov %r9d,0x4c(%rsp)¬
mov %rbp,%rsi¬
shl $0x6,%r8¬
mov $0x0,%r9d¬
lea 0x80(%r10,%r8,1),%r11¬
mov %r11,%rcx¬
sub %rax,%rcx¬
and $0x40,%ecx¬
movupd -0x40(%rax),%xmm0¬
movupd 0x0(%rbp),%xmm1¬
prefetcht0 0x1e0(%r10)¬
prefetcht0 0x1e0(%rbp)¬
prefetchw 0x1e0(%rbx)¬
mov $0x4,%r9d¬
mulpd %xmm1,%xmm0¬
lea 0x40(%rbp),%rsi¬
lea 0x40(%rbx),%rdx¬
mov %r10,0x40(%rsp)¬
movlpd %xmm0,(%rbx)¬
movhpd %xmm0,0x8(%rbx)¬
movupd -0x30(%rax),%xmm2¬
movupd 0x10(%rbp),%xmm3¬
mulpd %xmm3,%xmm2¬
movlpd %xmm2,0x10(%rbx)¬
movhpd %xmm2,0x18(%rbx)¬
movupd -0x20(%rax),%xmm4¬
movupd 0x20(%rbp),%xmm5¬
End of assembler dump.¬