我正在尝试调查访问数组元素和缓存未命中之间的关系。我写了以下代码。
#include <asm/unistd.h>
#include <linux/perf_event.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <inttypes.h>
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
int cpu, int group_fd, unsigned long flags) {
int ret;
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
group_fd, flags);
return ret;
}
int main(int argc, char **argv) {
struct perf_event_attr pe;
long long count;
int fd;
char *chars, c;
uint64_t n = 100000000;
int step = 64;
if (argc > 1) {
step = atoi(argv[1]);
}
chars = malloc(n * sizeof(char));
memset(&pe, 0, sizeof(struct perf_event_attr));
pe.type = PERF_TYPE_HW_CACHE;
pe.size = sizeof(struct perf_event_attr);
pe.config = PERF_COUNT_HW_CACHE_L1D |
PERF_COUNT_HW_CACHE_OP_READ << 8 |
PERF_COUNT_HW_CACHE_RESULT_MISS << 16;
pe.disabled = 1;
pe.exclude_kernel = 1;
pe.exclude_hv = 1;
fd = perf_event_open(&pe, 0, -1, -1, 0);
if (fd == -1) {
fprintf(stderr, "Error opening leader %llx\n", pe.config);
exit(EXIT_FAILURE);
}
for (size_t i = 0; i < n; i++) {
chars[i] = 1;
}
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
// Read from memory
for (size_t i = 0; i < n; i += step) {
c = chars[i];
}
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
read(fd, &count, sizeof(long long));
printf("%lld\n", count);
close(fd);
free(chars);
}
很容易知道,只有数组chars中的n/step元素被访问并赋值给c。如果步长大于缓存行(通常为 64),我认为缓存未命中的数量应该是 n/step。 step小的时候没有问题,即打印的count数约为n/step。然而,如果 step 是一个很大的数字,例如 1000000,count 大约等于 2n/step。这让我困惑了很长时间。谁能解释一下这个奇怪的结果?