Question

我正在尝试学习 OpenMP，但我的代码运行起来比不使用 OpenMP 时还要慢。关于这个问题已经有很多帖子，但似乎没有一个适用于我的情况。我写了一个简单的程序来演示 “omp parallel for” 的用法，运行后得到了以下性能数据。

   No OMP 0.0109663sec
   Parallel for: 0.0076869sec single thread
   Parallel for: 0.0151231sec 2 threads
   Parallel for: 0.0169528sec 4 threads
   Parallel for: 0.0150955sec 8 threads

使用 2 到 8 个内核的性能大约是不使用 openMP 的一半。显然，这不是我的预期。

我使用的是 Visual Studio Express 2015。无论优化器打开还是关闭，结果都一样。我已经在 C 编译器命令行中设置了 /openmp。我相信我已经正确设置了 shared 和 private 子句。我初始化的是一个包含 1,000,000 个元素的数组，因此创建并行线程的初始开销应该不是问题。我的处理器是英特尔 i7，有 8 个内核。

代码：我有两个函数 testparallelfor() 和 testnoomp()，函数名应该是不言自明的。语句 ++th[omp_get_thread_num()]; 只是用来统计每个线程执行了多少次循环迭代。即使我把这条语句注释掉，结果也是一样的。我也尝试过使用静态数组 double a[1000*1000]，以确认问题是否与变量 a 的动态堆分配有关。

#include <omp.h>

// Per-thread iteration counters: slot i is incremented by the thread whose
// omp_get_thread_num() returns i. Only eight slots exist.
static int th[8];

// Put every counter back to -1. out_th() prints th[i] + 1, so a slot that
// is never incremented afterwards is reported as 0.
void reset_th()
{
    for (int *slot = th; slot != th + 8; ++slot)
        *slot = -1;
}

// Print all eight counters on one line as "index:count, " pairs, where the
// count shown for slot i is th[i] + 1.
void out_th()
{
    cout << "Threads ";
    int slot = 0;
    while (slot < 8) {
        cout << slot << ":" << th[slot] + 1 << ", ";
        ++slot;
    }
    cout << endl;
}

// Time an OpenMP "parallel for" that writes la * 2 + 1 into a[la] for
// la in [0, len), then print the elapsed wall-clock time and the per-thread
// iteration counters.
//
// len - number of loop iterations. The buffer allocated here holds
//       1,000,000 doubles, so len must not exceed that. The parallel
//       region is only activated when len > 1000 (see the if clause).
// no  - thread count requested via num_threads(). th[] has eight slots,
//       so values above 8 would index past its end.
void testparallelfor(int len, int no)
{
    const int n = 1000 * 1000;
    double tw;
    double *a = new double[n];

    reset_th();
    tw = omp_get_wtime();
#pragma omp parallel shared(a, len, th) num_threads(no) if (len > 1000)
    {
#pragma omp for
        for (int la = 0; la < len; ++la)
        {
            ++th[omp_get_thread_num()];   // tally iterations run by this thread
            a[la] = la * 2 + 1;
        }
    }

    tw = omp_get_wtime() - tw;
    cout << "Parallel for " << tw << "sec" << endl;
    out_th();

    // Free the work buffer, which was previously leaked on every call.
    // This happens after the timer is stopped, so the measured interval is
    // unchanged.
    delete[] a;
}

// Serial baseline: run the same fill loop as testparallelfor() without any
// OpenMP pragma, then print the elapsed wall-clock time and the counters.
//
// len - number of loop iterations. The buffer allocated here holds
//       1,000,000 doubles, so len must not exceed that.
void testnoomp(int len)
{
    int n = 1000 * 1000;
    double tw;
    double *a = new double[n];

    reset_th();
    tw = omp_get_wtime();
    for (int la = 0; la < len; ++la)
    {
        // Kept so the loop body matches the parallel version exactly.
        ++th[omp_get_thread_num()];
        a[la] = la * 2 + 1;
    }

    tw = omp_get_wtime() - tw;
    cout << "No OMP " << tw << "sec" << endl;
    out_th();

    // Free the work buffer, which was previously leaked on every call.
    // This happens after the timer is stopped, so the measured interval is
    // unchanged.
    delete[] a;
}

// Run the serial baseline once, then the OpenMP version with 1, 2, 4 and
// 8 threads, each over one million iterations.
int main()
{
    const int count = 1000 * 1000;

    testnoomp(count);                        // no OpenMP

    int threads = 1;
    while (threads <= 8) {
        testparallelfor(count, threads);     // threads is the number of threads to be used
        threads *= 2;
    }

    cout << endl;
    return 0;
}

任何帮助或见解将不胜感激。

Answer 1

乍一看，一些笔记/观察：

从多个线程更新 int th[8] 会导致伪共享（false sharing）。这在 x86 架构上可能造成大幅减速。应让各元素之间至少相隔 64 字节。
与并行化开销相比，工作负载太短。特别是在第一次运行时，OpenMP 会懒惰地启动池中的额外线程。
工作负载太简单，主要受内存带宽限制（memory-bound）。内存带宽是有限的，并且由所有线程共享。随着工作集增大、不再能放入缓存，并行化还会因缓存抖动而进一步变慢。
一个好的优化编译器可能会把循环和/或计算优化掉，因为它们的结果从未被使用（幸运的是，MSVC 在这种情况下没有这样做）。
注意实际核心数与超线程处理器的数量。超线程不会使 CPU 容量翻倍。

如果我消除伪共享，提高工作负载的复杂度（在其中加入 sqrt 运算）并增大规模（迭代次数增加到 50 倍），那么确实可以看到性能有所提高。

测试结果（在 MSVC 2019 x64 /O2 /fp:fast 上测试）：

No OMP 0.413487sec
1 Parallel for 0.440291sec
2 Parallel for 0.217796sec
4 Parallel for 0.108129sec
8 Parallel for 0.0959285sec

在 8 个线程时，加速变得可以忽略不计。那是因为我的系统 (i7-7700) 有 4 个内核，并且由于每个内核只有一个 FP 执行单元，因此超线程对 sqrt 之类的操作没有帮助。

我调整后的版本：

#include <cmath>
#include <iostream>
#include <omp.h>
using namespace std;

// Counter wrapper whose only member is aligned to 64 bytes, so consecutive
// elements of an array of tsint sit 64 bytes apart. The surrounding answer
// uses this spacing to avoid false sharing when several threads update
// neighbouring counters.
struct tsint {
  alignas(64) int x; // spread elements by 64 bytes
};
// Per-thread iteration counters, indexed by omp_get_thread_num() in the
// test functions; eight slots.
static tsint th[8];

// Number of doubles in the work buffer that each test function allocates;
// the test loops wrap their index with "% n" to stay inside it.
static constexpr int n = 1000 * 1000;

void reset_th() {
  int i;
  for (i = 0; i < 8; ++i)
    th[i].x = -1;
}

void out_th() {
  int i;
  cout << "Threads ";
  for (i = 0; i < 8; ++i)
    cout << i << ":" << th[i].x  + 1 << ", ";
  cout << endl;
}

// Time an OpenMP "parallel for" of n*m iterations that stores
// sqrt(la * 2.1) + 1 into a[la % n], then print the thread count, the
// elapsed wall-clock time and the per-thread iteration counters.
//
// m  - multiplier for the iteration count; n*m is evaluated in int, so it
//      must stay within int range.
// no - thread count requested via num_threads(). th[] has eight slots, so
//      values above 8 would index past its end.
void testparallelfor(int m, int no) {
  double tw;
  double* a = new double[n]();   // zero-initialised work buffer

  reset_th();
  tw = omp_get_wtime();
#pragma omp parallel num_threads(no) // no need to specify shared vars, shared is the default
  {
#pragma omp for
    for (int la = 0; la < n*m; ++la) {
      ++th[omp_get_thread_num()].x;         // tally iterations run by this thread
      a[la % n] = sqrt(la * 2.1) + 1;       // heavier math
    }
  }

  tw = omp_get_wtime() - tw;
  cout << no << " Parallel for " << tw << "sec" << endl;
  out_th();

  // Free the work buffer, which was previously leaked on every call (main
  // calls this function eight times). This happens after the timer is
  // stopped, so the measured interval is unchanged.
  delete[] a;
}

// Serial baseline: run the same n*m-iteration loop as testparallelfor()
// without any OpenMP pragma, then print the elapsed wall-clock time and the
// counters.
//
// m - multiplier for the iteration count; n*m is evaluated in int, so it
//     must stay within int range.
void testnoomp(int m) {
  double tw;
  double* a = new double[n]();   // zero-initialised work buffer

  reset_th();
  tw = omp_get_wtime();
  for (int la = 0; la < n*m; ++la) {
    // Kept so the loop body matches the parallel version exactly.
    ++th[omp_get_thread_num()].x;
    a[la % n] = sqrt(la * 2.1) + 1;         // heavier math
  }

  tw = omp_get_wtime() - tw;
  cout << "No OMP " << tw << "sec" << endl;
  out_th();

  // Free the work buffer, which was previously leaked on every call.
  // This happens after the timer is stopped, so the measured interval is
  // unchanged.
  delete[] a;
}

// Run the serial baseline once, then sweep the OpenMP version over 1, 2, 4
// and 8 threads twice in a row.
int main(int argc, char** argv) {
  // Iteration multiplier: 50 when started without arguments. Derived from
  // argc so the compiler cannot treat it as a constant.
  const int m = argc + 49;

  testnoomp(m); // no OpenMP

  // The first sweep pays for starting the thread pool; the second sweep
  // repeats the same measurements now that the pool is hot.
  for (int pass = 0; pass < 2; ++pass) {
    for (int threads = 1; threads <= 8; threads *= 2)
      testparallelfor(m, threads); // threads is the number of threads to be issued
  }

  cout << endl;
  return 0;
}

最后说明：

尝试使用 /arch:AVX2 进行编译以进行自动矢量化，速度应该会提高一倍（但对于并行版本，内存带宽将成为更大的问题）。

Answer 2

使用共享数组 th 是拖慢任何多线程代码的“好”方法。数据以 64 字节长的缓存行为单位在内核之间共享。如果两个内核在同一个缓存行上工作，即使它们访问的是缓存行的不同部分，也会拖慢彼此的运行速度。这种现象被称为“伪共享”（false sharing）。
我对 OpenMP 了解不多。我通常不太信任由编译器内置机制自动生成的多线程代码。例如，如果您的大多数线程同步推进，那么它们都会修改同一个缓存行上的数据——这样效率并不高。编译器是否意识到了这一点？
您的代码并没有真正进行任何计算。它主要存储数据。大多数现代系统的内存总线有限。单个内核可以完全或几乎完全超载它。您可能需要服务器级处理器或其他东西才能拥有更好的内存总线。

OpenMP 运行速度比单线程慢

2 个答案: