为什么OMP任务比OMP运行慢?

时间:2018-01-06 17:16:10

标签: c openmp

我是OPENMP的新手,想用它来解决波动方程,序列代码在这里:

#include <time.h>
#include <stdio.h>
#include <omp.h>
#include <math.h>

#define GRID_SZ 3000
#define ARR_SZ GRID_SZ * GRID_SZ
#define PEAK_SZ 31

double *process_withoutomp() {
    double start = omp_get_wtime();
    int i, j;
    double dt = 0.04, C = 16, K = 0.1, h = 6;
    double *data, *olddata, *newdata, *tmp;
    double x[PEAK_SZ][PEAK_SZ], linspace[PEAK_SZ], delta = 2.0/(PEAK_SZ-1.0);
    data = (double*)malloc(sizeof(double)*ARR_SZ);
    olddata = (double*)malloc(sizeof(double)*ARR_SZ);
    newdata = (double*)malloc(sizeof(double)*ARR_SZ);

    for(i = 0; i < ARR_SZ; i++){
            data[i] = 1.0;
    }

    for(i = 0; i < PEAK_SZ; i++){
            linspace[i] = -1.0 + delta * i;
    }

    for(i = 0; i < PEAK_SZ; i++){
            for(j = 0; j < PEAK_SZ; j++){
                    x[i][j] = linspace[i];
            }
    }

    for(i = 0; i < PEAK_SZ; i++){
            for(j = 0; j < PEAK_SZ; j++){
                    data[(i+20)*GRID_SZ+j+20] += h * exp( -5 * (pow(x[i][j], 2 ) + pow(x[j][i], 2 )));
            }
    }

    for(i = 0; i < ARR_SZ; i++){
            olddata[i] = data[i];
    }

    for(i = 0; i < 20; i++){
            sequential_update_withoutomp( data, olddata, newdata, C, K, dt);
            tmp = olddata;
            olddata = data;
            data = newdata;
            newdata = tmp;
    }
    double end = omp_get_wtime();
    printf("without omp spend: %f\n",end-start);

    return data;}void sequential_update_withoutomp(double *data, double *olddata, double *newdata, double C, double K, double dt ){
    int i, j, add_i, sub_i, add_j, sub_j;
    double pot;
    for( i = 0; i < GRID_SZ; i++){
            for( j = 0; j < GRID_SZ; j++){
                    add_i = i+1 >= GRID_SZ ? i : i+1;
                    add_j = j+1 >= GRID_SZ ? j : j+1;
                    sub_i = i-1 < 0 ? 0 : i-1;
                    sub_j = j-1 < 0 ? 0 : j-1;
                    pot = data[add_i*GRID_SZ+j]+
                                data[sub_i*GRID_SZ+j]+
                                data[add_j+i*GRID_SZ]+
                                data[sub_j+i*GRID_SZ]-
                                4*data[i*GRID_SZ+j];
                    newdata[i * GRID_SZ + j] = 
                            ( pow(C * dt, 2) * pot * 2 + 4 * data[i * GRID_SZ + j] - olddata[i * GRID_SZ + j] *(2 - K * dt) ) / (2 + K * dt);
            }
    }}

以下是用于:

的版本
double *process_withomp() {
    double start = omp_get_wtime();

    int i, j;
    double dt = 0.04, C = 16, K = 0.1, h = 6;
    double *data, *olddata, *newdata, *tmp;
    double x[PEAK_SZ][PEAK_SZ], linspace[PEAK_SZ], delta = 2.0/(PEAK_SZ-1.0);
    data = (double*)malloc(sizeof(double)*ARR_SZ);
    olddata = (double*)malloc(sizeof(double)*ARR_SZ);
    newdata = (double*)malloc(sizeof(double)*ARR_SZ);

    #pragma omp parallel for private(i) schedule(auto)
    for(i = 0; i < ARR_SZ; i++){
            data[i] = 1.0;
    }

    #pragma omp parallel for private(i,j) schedule(auto)
    for(i = 0; i < PEAK_SZ; i++){
        linspace[i] = -1.0 + delta * i;
        for(j = 0; j < PEAK_SZ; j++) {
            x[i][j] = linspace[i];
        }
    }

    #pragma omp barrier

    #pragma omp parallel for private(i,j) schedule(auto)
    for(i = 0; i < PEAK_SZ; i++){
        for(j = 0; j < PEAK_SZ; j++){
            data[(i+20)*GRID_SZ+j+20] += h * exp( -5 * (pow(x[i][j], 2 ) + pow(x[j][i], 2 )));
        }
    }

    #pragma omp barrier

    #pragma omp parallel for private(i) schedule(auto)
    for(i = 0; i < ARR_SZ; i++){
        olddata[i] = data[i];
    }

    #pragma omp barrier

    for(i = 0; i < 20; i++) {
        sequential_update_withomp( data, olddata, newdata, C, K, dt);
        tmp = olddata;
        olddata = data;
        data = newdata;
        newdata = tmp;
    }


    double end = omp_get_wtime();
    printf("with omp spend: %f\n",end-start);
    return data;}void sequential_update_withomp(double *data, double *olddata, double *newdata, double C, double K, double dt ) {
    int i, j;
    double pot;
    #pragma omp parallel for private(i,j,pot) schedule(auto)
    for( i = 0; i < GRID_SZ; i++) {
        for( j = 0; j < GRID_SZ; j++) {
            pot = data[(i+1 >= GRID_SZ ? i : i+1)*GRID_SZ+j]+
                data[(i-1 < 0 ? 0 : i-1)*GRID_SZ+j]+
                data[(j+1 >= GRID_SZ ? j : j+1)+i*GRID_SZ]+
                data[(j-1 < 0 ? 0 : j-1)+i*GRID_SZ]
                -4*data[i*GRID_SZ+j];
            newdata[i * GRID_SZ + j] = 
                        (pow(C * dt, 2) * pot * 2 + 4 * data[i * GRID_SZ + j] - olddata[i * GRID_SZ + j] 
                        * (2 - K * dt)) 
                        / (2 + K * dt);
        }
    }}

这个版本运行良好但是当我尝试使用任务来替换它时,结果是正确的,但是时间花费更多:

double *process_withomp1() {
    double start = omp_get_wtime();

    int i, j;
    double dt = 0.04, C = 16, K = 0.1, h = 6;
    double *data, *olddata, *newdata, *tmp;
    double x[PEAK_SZ][PEAK_SZ], linspace[PEAK_SZ], delta = 2.0/(PEAK_SZ-1.0);
    data = (double*)malloc(sizeof(double)*ARR_SZ);
    olddata = (double*)malloc(sizeof(double)*ARR_SZ);
    newdata = (double*)malloc(sizeof(double)*ARR_SZ);

    #pragma omp parallel for private(i) schedule(auto)
    for(i = 0; i < ARR_SZ; i++){
            data[i] = 1.0;
    }

    #pragma omp parallel for private(i,j) schedule(auto)
    for(i = 0; i < PEAK_SZ; i++){
        linspace[i] = -1.0 + delta * i;
        for(j = 0; j < PEAK_SZ; j++) {
            x[i][j] = linspace[i];
        }
    }

    #pragma omp barrier

    #pragma omp parallel for private(i,j) schedule(auto)
    for(i = 0; i < PEAK_SZ; i++){
        for(j = 0; j < PEAK_SZ; j++){
            data[(i+20)*GRID_SZ+j+20] += h * exp( -5 * (pow(x[i][j], 2 ) + pow(x[j][i], 2 )));
        }
    }

    #pragma omp barrier

    #pragma omp parallel for private(i) schedule(auto)
    for(i = 0; i < ARR_SZ; i++){
        olddata[i] = data[i];
    }

    #pragma omp barrier

    for(i = 0; i < 20; i++) {
        sequential_update_withomp1( data, olddata, newdata, C, K, dt);
        tmp = olddata;
        olddata = data;
        data = newdata;
        newdata = tmp;
    }


    double end = omp_get_wtime();
    printf("with omp spend: %f\n",end-start);
    return data;}
void sequential_update_withomp1(double *data, double *olddata, double *newdata, double C, double K, double dt ) {
    int i, j;
    double pot;
    #pragma omp parallel private(i,j,pot)
    for( i = 0; i < GRID_SZ; i++) {
        for( j = 0; j < GRID_SZ; j++) {
            #pragma omp task
            {
            pot = data[(i+1 >= GRID_SZ ? i : i+1)*GRID_SZ+j]+
                    data[(i-1 < 0 ? 0 : i-1)*GRID_SZ+j]+
                    data[(j+1 >= GRID_SZ ? j : j+1)+i*GRID_SZ]+
                    data[(j-1 < 0 ? 0 : j-1)+i*GRID_SZ]
                    -4*data[i*GRID_SZ+j];
            newdata[i * GRID_SZ + j] = 
                            (pow(C * dt, 2) * pot * 2 + 4 * data[i * GRID_SZ + j] - olddata[i * GRID_SZ + j] 
                            * (2 - K * dt)) 
                            / (2 + K * dt);
            }
        }
    }}

在我的Mac中,串行版本需要大约7.7秒,版本需要3.7秒,但任务使用53秒。

任何人都知道这里出了什么问题?

提前致谢

1 个答案:

答案 0 :(得分:2)

这里有两件事需要考虑:

  • a)主题粒度,即每个主题的工作量
  • b)创建任务
  • 的方式

在你的代码中,a)太小了,b)坏了。

<强> A) 在您的task示例中,内循环的一次迭代是一项任务,而在parallel for示例中, n外部循环的迭代是并行化,即每个线程处理外部循环的一大块迭代。使用schedule(static, 1),一个外部迭代将是每个线程的工作大小。请记住,所有并行性都会增加开销,用于同步内容,记账等。这增加了成本,必须通过提高并行执行的执行速度来补偿。找到合适的工作量是至关重要的,你需要尽可能多的工作来保持一切忙碌,也许还需要更多的时间来为调度程序提供一些空间来弥补任务/块之间的负载不平衡,但尽可能少地保持开销很小。

<强> b)中 在并行区域中运行循环意味着每个线程都在运行整个循环嵌套并多次创建所有任务。这就像并行多次运行串行程序一样。

void sequential_update_withomp1(double *data, double *olddata, double *newdata, double C, double K, double dt ) {
// ....
#pragma omp parallel private(i,j,pot)
{
    // split loop among threads of parallel region
    // i.e. create tasks in parallel
    #pragma omp for
    for( i = 0; i < GRID_SZ; i++) {
        // coarse grained tasks (as in parallel for version)
        #pragma omp task
        {
            // each inner for loop is one task
            for( j = 0; j < GRID_SZ; j++) {
                // ...
            }
        } // task
    } // parallel for
} // parallel region

这给了我(2核x 2超线程):

serial:        4.839213
parallel for:  2.529813
task:          2.817615

注意:此处实际使用任务没有任何意义,因为它们只会在并行for循环之上增加开销。