Question

测量我的应用程序（使用OpenMP编写的多线程程序）所使用的带宽，最简单、最有效的方法是什么？我运行了STREAM以获得最大可持续带宽，现在想知道我的应用程序是否已经使全部可用带宽饱和。

我发现了一些相关问题（例如Main memory bandwidth measurement），但我找不到这个问题的答案;

可悲的是，我无法使用VTune，但我可以使用PAPI计数器;

我的主要目标是找出应用程序的可扩展性是否与内存带宽的饱和度相关联。

谢谢

Answer 1

有很多方法可以（从命令行）获得整个应用程序的带宽，但听起来您有若干个计算内核（kernel，即代码中的热点例程）需要单独查看。在这种情况下，用PAPI调用包装相应的代码段是一种非常明智的方法。

您可以使用系统上的PAPI事件计数器（可用papi_avail列出）来获得加载/存储指令的总数；如果您知道每次加载/存储的大小，就可以由此算出内存带宽。或者，您可以统计缓存未命中数，并乘以缓存行大小，以推断在系统中实际传输的数据量。PAPI维基上有多处相关文档，例如关于高级接口的说明，以及一些有用的派生量计算公式。

下面是一个简单的编码示例：分别以合理的方式和对缓存不友好的转置方式进行矩阵向量乘法。请注意，调用PAPI_read_counters会重置计数器，而这正是我们想要的。

#include <stdio.h>
#include <stdlib.h>
typedef char * caddr_t;
#include <papi.h>
#include <sys/time.h>

/* Forward declarations for the helpers that are defined further down in this file. */
int init(float ***a, float **x, float **y, int size);
void report_results(char *tname, long_long *values, const int n, double wtime);
void sensible_matvec(float **a, float *x, float *y, int size);
void wrong_order_matvec(float **a, float *x, float *y, int size);
void tick(struct timeval *t);
double tock(struct timeval *t);

/* Number of PAPI events sampled; main uses it to size its events[] and values[] arrays. */
#define NUM_EVENTS 3
/*
 * Benchmark driver.
 *
 * Allocates a 4096x4096 matrix and two vectors, then times two matrix-vector
 * products (row-order and column-order traversal) while sampling three PAPI
 * counters, and prints a report for each run.
 *
 * Exits with status 1 if allocation fails or if any PAPI call does not
 * return PAPI_OK, so that a report is never printed from invalid data.
 */
int main(int argc, char **argv) {
    const int matsize = 4096;

    float **a, *x, *y;
    /* init reports allocation failure through a non-zero return value. */
    if (init(&a, &x, &y, matsize) != 0) {
        fprintf(stderr, "Error allocating benchmark data; aborting\n");
        exit(1);
    }

    /* The order matters: report_results reads L1 misses from index 0,
     * load/store instructions from index 1 and FP instructions from index 2. */
    int events[NUM_EVENTS] = {PAPI_L1_DCM, PAPI_LST_INS, PAPI_FP_INS};
    long_long values[NUM_EVENTS];

    double walltime;
    struct timeval t;

    if (PAPI_start_counters(events, NUM_EVENTS) != PAPI_OK) {
        fprintf(stderr, "Error starting PAPI counters; aborting\n");
        exit(1);
    }

    tick(&t);
    sensible_matvec(a, x, y, matsize);
    /* Reading also resets the counters, so the second run starts from zero. */
    if (PAPI_read_counters(values, NUM_EVENTS) != PAPI_OK) {
        fprintf(stderr, "Error reading PAPI counters; aborting\n");
        exit(1);
    }
    walltime = tock(&t);

    report_results("Sensible", values, NUM_EVENTS, walltime);

    tick(&t);
    wrong_order_matvec(a, x, y, matsize);
    /* Final sample: fetch the counts for the second run and stop counting. */
    if (PAPI_stop_counters(values, NUM_EVENTS) != PAPI_OK) {
        fprintf(stderr, "Error stopping PAPI counters; aborting\n");
        exit(1);
    }
    walltime = tock(&t);

    report_results("Wrong order", values, NUM_EVENTS, walltime);

    return 0;
}

/*
 * Print the timing and counter-derived figures for one benchmark run.
 *
 * tname  - label printed for the run.
 * values - counter samples; index 0 is read as L1 data-cache misses,
 *          index 1 as load/store instructions, index 2 as FP instructions.
 * n      - number of entries in values; at least 3 are required.
 * wtime  - elapsed wall-clock time in seconds.
 *
 * If values is NULL or holds fewer than 3 entries, a message is written to
 * stderr and nothing is reported, instead of reading past the end of the array.
 */
void report_results(char *tname, long_long *values, const int n, double wtime) {
    enum { REQUIRED_COUNTERS = 3 };

    if (values == NULL || n < REQUIRED_COUNTERS) {
        fprintf(stderr, "report_results: need %d counter values, got %d\n",
                REQUIRED_COUNTERS, n);
        return;
    }

    long_long total_mem = values[1];
    long_long total_flops = values[2];
    long_long l1misses = values[0];
    printf("Test %s: time elapsed = %f, memory accesses = %lld, flop = %lld\n",
            tname, wtime, total_mem, total_flops);
    /* Every counted load/store is treated as moving one float. */
    printf("\tMemory bandwidth (MB/sec) = %f\n", 1.0*total_mem*sizeof(float)/(wtime*1024*1024));
    printf("\tL1 cache miss rate = %f\n", 1.0*l1misses/total_mem);
    printf("\tMFLOPS = %lf\n\n", 1.0*total_flops/(wtime*1024*1024));
}

/* Allocation helpers, defined later in this file. The alloc functions return 0
 * on success and -1 when malloc fails; the free functions always return 0. */
int alloc2d(float ***a, int n);
int free2d(float ***a, int n);
int alloc1d(float **x, int n);
int free1d(float **x, int n);

/*
 * Allocate and fill the benchmark data.
 *
 * On success *a is a size x size matrix with a[i][j] = i, *x is a vector with
 * x[i] = i, and *y is a vector of zeros.
 *
 * Returns 0 on success. Each failure has its own code so the caller can tell
 * which allocation failed: -1 for the matrix, -2 for x, -3 for y. Anything
 * allocated before the failure is released again.
 */
int init(float ***a, float **x, float **y, int size) {
    if (alloc2d(a,size))
        return -1;

    if (alloc1d(x,size)) {
        free2d(a,size);
        return -2;
    }

    if (alloc1d(y,size)) {
        free2d(a,size);
        free1d(x,size);
        return -3;
    }

    for (int i=0; i<size; i++) {
        (*x)[i] = (float)i;
        (*y)[i] = 0.;
    }

    for (int i=0; i<size; i++) {
        for (int j=0; j<size; j++) {
            (*a)[i][j] = (float)i;
        }
    }

    return 0;
}
/*
 * Accumulate a matrix-vector product into y: y[i] += a[i][j] * x[j] for every
 * i and j below size. The row index is the outer loop, so the inner loop walks
 * along a single row a[i]. y is added to, not overwritten.
 */
void sensible_matvec(float **a, float *x, float *y, int size) {
    int row;
    int col;

    for (row = 0; row < size; ++row) {
        for (col = 0; col < size; ++col) {
            y[row] += a[row][col] * x[col];
        }
    }
}

/*
 * Same accumulation as sensible_matvec (y[i] += a[i][j] * x[j]) but with the
 * loops swapped: the column index is the outer loop, so the inner loop moves
 * from row to row for a fixed column. This is the deliberately cache-unfriendly
 * variant of the benchmark.
 */
void wrong_order_matvec(float **a, float *x, float *y, int size) {
    int col;
    int row;

    for (col = 0; col < size; ++col) {
        for (row = 0; row < size; ++row) {
            y[row] += a[row][col] * x[col];
        }
    }
}

/* Store the current time of day in *t; pair with tock(t) to get the elapsed time. */
void tick(struct timeval *t) {
    gettimeofday(t, NULL);
}


/*
 * Return the time elapsed, in seconds, between the instant stored in *t
 * (normally by tick) and now. *t itself is not modified.
 */
double tock(struct timeval *t) {
    struct timeval end;
    gettimeofday(&end, NULL);

    /* Whole seconds and the microsecond remainder are differenced separately;
     * the microsecond part may be negative and is then subtracted by the sum. */
    const double whole_seconds = (double)(end.tv_sec - t->tv_sec);
    const double micro_delta = (double)(end.tv_usec - t->tv_usec);

    return whole_seconds + (micro_delta / 1000000.);
}


/*
 * Release everything the benchmark allocated: the matrix through free2d and
 * both vectors through free1d. size is passed on to those helpers unchanged.
 */
void freeall(float ***a, float **x, float **y, int size) {
    free2d(a, size);
    free1d(x, size);
    free1d(y, size);
    return;
}

/*
 * Allocate an n x n float matrix as one contiguous data buffer plus a table of
 * n row pointers, and store the row table in *a. Row i starts at element i*n
 * of the data buffer, so (*a)[0] is also the address of the whole buffer.
 *
 * Returns 0 on success and -1 on failure. Failure covers n <= 0, a byte count
 * that does not fit in size_t, and malloc returning NULL. The element values
 * are left uninitialised.
 */
int alloc2d(float ***a, int n) {
    if (n <= 0) return -1;

    /* Do all size arithmetic in size_t: n*n in int overflows for n >= 46341. */
    const size_t dim = (size_t)n;
    if (dim > ((size_t)-1) / sizeof(float) / dim) return -1;

    float *data = malloc(dim * dim * sizeof *data);
    if (data == NULL) return -1;

    float **rows = malloc(dim * sizeof *rows);
    if (rows == NULL) {
        free(data);
        *a = NULL;
        return -1;
    }

    for (size_t i = 0; i < dim; i++)
        rows[i] = data + i * dim;

    *a = rows;
    return 0;
}
/*
 * Free a matrix laid out as a row-pointer table over one contiguous data
 * buffer whose address is the first row pointer. Both the data buffer and the
 * table are freed, then *a is set to NULL so that a repeated call is harmless.
 *
 * Does nothing when a or *a is NULL. n is unused. Always returns 0.
 */
int free2d(float ***a, int n) {
    (void)n;

    if (a == NULL || *a == NULL)
        return 0;

    free((*a)[0]);   /* the data buffer starts at row 0 */
    free(*a);
    *a = NULL;

    return 0;
}


/*
 * Allocate an uninitialised vector of n floats and store its address in *a.
 * *a receives whatever malloc returned, including NULL on failure.
 * Returns 0 when the allocation succeeded and -1 otherwise.
 */
int alloc1d(float **a, int n) {
    float *buf = (float *)malloc(sizeof *buf * n);

    *a = buf;
    return (buf != NULL) ? 0 : -1;
}

/*
 * Free the vector *a and set *a to NULL, so the caller is not left with a
 * dangling pointer and a repeated call is harmless.
 *
 * Does nothing when a is NULL. n is unused. Always returns 0.
 */
int free1d(float **a, int n) {
    (void)n;

    if (a == NULL)
        return 0;

    free(*a);
    *a = NULL;

    return 0;
}

运行结果如下：

$ gcc -o papi-test papi-test.c -I${PAPI_INC_DIR} -L${PAPI_LIB_DIR} -lpapi -Wall -std=c99
$ ./papi-test
Test Sensible: time elapsed = 0.121877, memory accesses = 302020775, flop = 33580481
    Memory bandwidth (MB/sec) = 9453.119330
    L1 cache miss rate = 0.003921
    MFLOPS = 262.763624

Test Wrong order: time elapsed = 0.537639, memory accesses = 302026751, flop = 39629352
    Memory bandwidth (MB/sec) = 2142.963254
    L1 cache miss rate = 0.094045
    MFLOPS = 70.295301

Answer 2

要测量应用程序的带宽，您需要知道读取和/或写入了多少内存，我们称之为分子；还需要知道读取和/或写入这些内存花了多长时间，我们称之为分母。带宽就是分子/分母。

如果您的应用程序很复杂，那么计算读取和/或写入的内存可能并不那么容易。此外，如果您的应用程序正在执行许多其他操作，则可能不容易计算时间。您必须减去其他操作的时间。因此，在测量最大吞吐量时，通常会使用简单的算法。

如果您想选择一个可比较的算法来与您的应用程序进行对比，那么您应该先看看您的应用程序是只写入数据、只读取数据，还是既读取又写入。

如果您只是在写数据，可以使用写（memset）测试：

/* Write-only (memset-style) test: every iteration stores k into x[i]. */
#pragma omp parallel for
for(int i=0; i<n; i++) {
    x[i] = k;
}

如果您同时读取和写入数据，可以进行简单的复制（memcpy）测试：

/* Copy (memcpy-style) test: every iteration reads x[i] and writes it to y[i]. */
#pragma omp parallel for
for(int i=0; i<n; i++) {
    y[i] = x[i];
}

事实上，如果你看一下STREAM源代码，它基本上就是它对复制测试的作用。

如果您只是读取数据，可以像这样进行归约（reduction）（如果希望它被矢量化，请确保使用-ffast-math进行编译）：

/* Read-only test: x and y are only read; the products are summed into sum
 * through the OpenMP reduction clause. */
#pragma omp parallel for reduction(+:sum)
for(int i=0; i<n; i++) {
    sum += x[i]*y[i];
}

STREAM的测试都是读写测试。我编写了自己的带宽工具，它可以分别进行只写、读写和只读测试。

不幸的是，写入数据的测试无法接近峰值带宽。原因是为了写入数据，必须先将数据读入缓存。这就是STREAM无法接近系统峰值带宽的原因。为了在写入时获得峰值带宽，您需要使用非时间性存储（non-temporal stores），它只写入数据，而不会先将其读入缓存。

例如，对于SSE并假设x和y是浮点数组，您可以像这样进行读写测试：

/* Read/write test using SSE: each iteration loads four floats from x and
 * writes them to y with _mm_stream_ps (the non-temporal store discussed in
 * the surrounding text).
 * NOTE(review): _mm_load_ps and _mm_stream_ps are documented as requiring
 * 16-byte aligned addresses - confirm that x and y are allocated that way.
 * NOTE(review): the loop runs n/4 times, so when n is not a multiple of 4 the
 * last n%4 elements are not copied. */
#pragma omp parallel for    
for(int i=0; i<n/4; i++) {
    __m128 v = _mm_load_ps(&x[4*i]);
    _mm_stream_ps(&y[4*i], v);
}

如果你看一下Agner Fog的asmlib，你会发现这正是他对大型数组的memset和memcpy所做的事情。实际上，他的asmlib和我刚才给出的那个例子在我的系统上可以达到带宽的85%（51 GB/s中的45 GB/s），而STREAM测试只能达到带宽的约45%。

这些测试假设您的算法是内存受限（memory bound）的，并且用于比较时您读取的数组比最慢的那一级缓存大得多。如果您的算法会重用仍在缓存中的数据，那么由于循环携带依赖（loop-carried dependency），读取测试将无法接近峰值带宽。要解决此问题，您必须根据操作和硬件将循环展开3-10次。此外，如果您写入的是位于缓存中且稍后还要重用的数组，那么您不应使用非时间性存储。这就是为什么Agner Fog的asmlib仅对大型数组使用非时间性存储。

测量多线程应用程序的带宽

2 个答案: