Question

根据http://igoro.com/archive/gallery-of-processor-cache-effects/，在尝试示例2时，时间应该下降，直到偏移量等于缓存行zie。
但是，在我的机器上，它不起作用代码如下所示。

#define SIZE 1024*1024*64

int main()
{
struct timeval start, end;
int k;
int i;

for(k = 1; k <= 1024; k *= 2)
{

    int *arr = (int*)malloc(SIZE * sizeof(int));
    gettimeofday(&start, NULL);
    for(i = 0; i < SIZE; i += k)
        arr[i] *= 3;
    gettimeofday(&end, NULL);

    printf("K = %d, time = %d\n", k,
            (end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));

    free(arr);
}
return 0;
}

结果如下：

K = 1，时间= 410278
K = 2，时间= 265313
K = 4，时间= 201540
K = 8，时间= 169800
K = 16，时间= 155123
K = 32，时间= 142496
K = 64，时间= 137967
K = 128，时间= 135818
K = 256，时间= 135128
K = 512，时间= 135167
K = 1024，时间= 135462

Answer 1

这取决于编译器（其版本），优化级别和CPU。显然，大部分时间花在malloc上，所以我将其移出循环并增加SIZE。

我正在使用16GB内存的i3770K处理器上使用GCC 4.8.1尝试使用Debian / Sid。

带

#include <stdio.h>
#include <stdlib.h> 
#include <sys/time.h>
#include <time.h>
#define SIZE 1024*1024*1024

int main ()
{
  struct timeval start, end;
  clock_t startcl, endcl;
  int k, i;

  int *arr = (int *) malloc (SIZE * sizeof (int));
  if (!arr) { perror("malloc"); exit(EXIT_FAILURE); };
  for (k = 1; k <= 1024; k *= 2)  {
      gettimeofday (&start, NULL);
      startcl = clock();
      for (i = 0; i < SIZE; i += k)
        arr[i] *= 3;
      gettimeofday (&end, NULL);
      endcl = clock();
      printf ("K = %d, time = %ld, cpu clock=%ld microsec\n", k,
              (end.tv_sec - start.tv_sec) * 1000000 
              + (end.tv_usec - start.tv_usec),
              (long) (endcl - startcl));
    }
  free (arr);
  return 0;
}

并使用gcc -Wall -mtune=native -O3 ./wilsonwen.c -o ./wilsonwen-O3进行编译然后运行它：

K = 1, time = 696074, cpu clock=680000 microsec
K = 2, time = 361173, cpu clock=360000 microsec
K = 4, time = 341920, cpu clock=340000 microsec
K = 8, time = 341767, cpu clock=340000 microsec
K = 16, time = 342065, cpu clock=340000 microsec
K = 32, time = 224502, cpu clock=230000 microsec
K = 64, time = 119544, cpu clock=120000 microsec
K = 128, time = 51089, cpu clock=50000 microsec
K = 256, time = 26447, cpu clock=20000 microsec
K = 512, time = 14104, cpu clock=20000 microsec
K = 1024, time = 8385, cpu clock=10000 microsec

这与您提到的博客更为一致。将malloc移出k外部循环非常重要（如果不这样做，则看不到缓存效果，显然是因为malloc和基础{{1系统调用吃了很多时间。）

我无法解释为什么mmap需要更多时间（也许是因为k=1 - 内存被页面错误带入RAM？）。即使在malloc循环之前添加for (i=0; i<SIZE/1024; i++) arr[i] = i;循环以“预取网页”，for (k的时间仍然几乎是k=1的两倍。我们确实看到了Igor Ostrovsky博客中提到的k=2到k=2的高原。用k=16替换malloc并不是很重要。使用calloc（3.2）而不是clang（4.8）进行编译可以得到非常相似的时序结果。

优化非常重要，尝试使用gcc并运行我没有看到任何平台（即使使用gcc -Wall -O0 ./wilsonwen.c -o ./wilsonwen-O0也会看到）。众所周知，-O1没有任何优化标志会吐出很差的机器代码。

基准测试的一般规则是启用编译器优化。

Answer 2

和我一样。

有人在该条的讨论中得到了相同的结果。他说也许这只是arr [i] * = 3的实际CPU时间。

当K = 1时，需要运行SIZE次。但是当K = 2时，它只需要运行SIZE / 2次。

因此，无论K是什么，您都可以重写代码以运行相同的时间。然后检查它是否相同。打字的时候我就是这么想的。我稍后会试试。如果您尝试过，请添加评论。

这是我的代码：

#include <stdio.h>
#include <time.h>
#include <stdint.h>
#define SIZE 64*1024*1024
int32_t arr [SIZE];
struct timespec ts;

int main(int argc, char *argv[])
{
long i,j= 0;
long start;
long start_sec;
int count = 1;
int k = 0;

// init the arr;
for (i = 0; i< 64*1024*1024;++i){
    arr[i] = 0;
}

for (j = 1; j< 1025;){
    clock_gettime(CLOCK_REALTIME, &ts);
    start = ts.tv_nsec;
    start_sec = ts.tv_sec;
    for (i = 0, k = 0; i< 64*1024*1024; i++, k+=j){
        k = k & (SIZE -1);
        arr[k] *=3;
        arr[k] =1;
    }
    clock_gettime(CLOCK_REALTIME, &ts);
    printf ("%d, %ld, %ld\n", count,(ts.tv_sec-start_sec)*1000000000+(ts.tv_nsec -start), j);
    count++;
    j *= 2;
}
return 0;
}

，输出如下：

1, 352236657, 1
2, 356920027, 2
3, 375986006, 4
4, 494875602, 8
5, 957796009, 16
6, 1397285233, 32
7, 1784398514, 64
8, 1070586859, 128
9, 1130548756, 256
10, 1169113810, 512
11, 1312605482, 1024

如果我注释掉arr-init循环，当K = 1时，它需要比K = 2更多的时间。

我们可以看到，时间会随着期望的增加而增加（在K = 128之前）。因为我们总是有一个64 * 1024 * 1024次的循环，尽管K.K越大，缓存行将刷新的时间越长。

好吧，但我无法解释从K = 64到K = 128的减少。

@Mysticial讨论了lazy-malloc，所以我也在文章中对原始代码进行了实验，但添加了arr-init循环以避免lazy-malloc问题。 K = 1的成本确实降低了，但它仍然大于K = 2的成本，并且比原始版本更接近K = 2的成本* 2。数据如下：

1, 212882204, 1
2, 111660951, 2
3, 67843457, 4
4, 62980310, 8
5, 62092973, 16
6, 42531407, 32
7, 27686909, 64
8, 9142755, 128
9, 4064936, 256
10, 2342842, 512
11, 1130305, 1024

所以我认为从K = 1到K = 2和K = 2到K = 4的原因是从SIZE到SIZE / 2的迭代次数减少。

这是我的想法，但我不确定。

=============================================== =======

我用-Ox编写了代码，减少了消失（但必须添加arr-init循环）。感谢@Basile。我稍后会检查asm代码中的差异。

这是asm代码之间的差异：

没有O1，

    movq    $0, -32(%rbp)
    jmp .L5
.L6:
    movq    -32(%rbp), %rax
    movl    arr(,%rax,4), %edx
    movl    %edx, %eax
    addl    %eax, %eax
    addl    %eax, %edx
    movq    -32(%rbp), %rax
    movl    %edx, arr(,%rax,4)
    movq    -24(%rbp), %rax
    addq    %rax, -32(%rbp)
.L5:
    cmpq    $67108863, -32(%rbp)
    jle .L6

用O1，

    movl    $0, %eax 
.L3:
    movl    arr(,%rax,4), %ecx # ecx = a[i]
    leal    (%rcx,%rcx,2), %edx # edx = 3* rcx
    movl    %edx, arr(,%rax,4) # a[i] = edx
    addq    %rbx, %rax # rax += rbx
    cmpq    $67108863, %rax 
    jle .L3

我将没有O1的asm代码更改为

    movq    $0, -32(%rbp) 
    movl    $0, %eax
    movq    -24(%rbp), %rbx
    jmp .L5
.L6:
    movl    arr(,%rax,4), %edx 
    movl    %edx, %ecx 
    addl    %ecx, %ecx 
    addl    %ecx, %edx 
    movl    %edx, arr(,%rax,4) 
    addq    %rbx, %rax
.L5:
    cmpq    $67108863, %rax 
    jle .L6

然后我得到了结果：

1, 64119476, 1
2, 63417463, 2
3, 63732534, 4
4, 66703562, 8
5, 65740635, 16
6, 47743618, 32
7, 28402013, 64
8, 9444894, 128
9, 4544371, 256
10, 2991025, 512
11, 1242882, 1024

它几乎与O1相同。似乎movq -32（％rbp），％rax的东西花费太多了。但我不知道为什么。

也许我最好问一个关于它的新问题。

为什么缓存不能正常工作？

2 个答案: