循环转换以实现数据依赖性和并行化

时间:2018-08-07 18:37:05

标签: c algorithm loops optimization openmp

我有一组嵌套的for循环,用于遍历三维空间(每个维度对应一层循环)。这组嵌套循环是一个基于模板(stencil)的矩阵求解器的一部分,该求解器中的操作存在数据依赖。我查阅了很多链接和在线资料,深入研究了循环变换的细节,看起来循环倾斜(loop skewing)可以帮到我。对于2d网格(由两层嵌套循环组成)来说,这相当简单,但我发现很难将其扩展到3d。循环如下所示。

// Parallel region with three work-shared loop nests over the index range
// 1..nx x 1..ny x 1..nz. i, j, k, mythread and dummy are private to each
// thread; the arrays, grid sizes and PeriodicBoundary flags are shared.
// Each "#pragma omp for" below has no collapse clause, so only the outermost
// (i) loop is divided among the threads; the j and k loops run in full inside
// every i iteration.
// NOTE(review): NTt is used in the clauses but is not listed under
// default(none); presumably it is a macro or constant -- confirm.
# pragma omp parallel num_threads(NTt) default(none) private(i,j,k, mythread, dummy) shared(STA,res_sparse_s,COEFF,p_sparse_s, ap_sparse_s,h_sparse_s,RLL, pipi_sparse, normres_sparse, riri_sparse,riri_sparse2,noemer_sparse, nx, ny, nz, nv, PeriodicBoundaryX, PeriodicBoundaryY, PeriodicBoundaryZ)
{

    // Thread id; not used anywhere else in the visible part of the region.
    mythread  = omp_get_thread_num();//0

    // loop 1
    // Computes ap_sparse_s[i][j][k] as a weighted sum of p_sparse_s at the point
    // itself (COEFF[..][6]) and at its six neighbours (COEFF[..][0..5]), and
    // accumulates sum(p_sparse_s * ap_sparse_s) into pipi_sparse via the
    // reduction clause. p_sparse_s is only read and each iteration writes only
    // its own ap_sparse_s[i][j][k], so iterations do not depend on each other.
    #pragma omp for reduction(+:pipi_sparse)
    for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++)
    {                    
        dummy = COEFF[i][j][k][6] * p_sparse_s[i][j][k];

        // For each direction: on a grid edge with the matching periodic flag set,
        // the neighbour is taken from the opposite side of the grid; otherwise the
        // plain i-1 / i+1 (etc.) neighbour is read. Without periodicity this reads
        // index 0 or nx+1 (ny+1, nz+1) -- presumably halo cells; confirm.
        if (PeriodicBoundaryX && i == 1)  dummy += COEFF[i][j][k][0] * p_sparse_s[nx ][j][k];
        else                              dummy += COEFF[i][j][k][0] * p_sparse_s[i-1][j][k];

        if (PeriodicBoundaryX && i == nx) dummy += COEFF[i][j][k][1] * p_sparse_s[1  ][j][k];
        else                              dummy += COEFF[i][j][k][1] * p_sparse_s[i+1][j][k];

        if (PeriodicBoundaryY && j == 1)  dummy += COEFF[i][j][k][2] * p_sparse_s[i][ny ][k];
        else                              dummy += COEFF[i][j][k][2] * p_sparse_s[i][j-1][k];

        if (PeriodicBoundaryY && j == ny) dummy += COEFF[i][j][k][3] * p_sparse_s[i][  1][k];
        else                              dummy += COEFF[i][j][k][3] * p_sparse_s[i][j+1][k];

        if (PeriodicBoundaryZ && k == 1)  dummy += COEFF[i][j][k][4] * p_sparse_s[i][j][nz ];
        else                              dummy += COEFF[i][j][k][4] * p_sparse_s[i][j][k-1];


        if (PeriodicBoundaryZ && k == nz) dummy += COEFF[i][j][k][5] * p_sparse_s[i][j][  1];
        else                              dummy += COEFF[i][j][k][5] * p_sparse_s[i][j][k+1];

        ap_sparse_s[i][j][k] = dummy;
        pipi_sparse += p_sparse_s[i][j][k] * ap_sparse_s[i][j][k];
    }


    // loop 2
        // FORWARD
        // Forward sweep, updating RLL in place: each point starts from
        // res_sparse_s, subtracts coefficient-weighted RLL values at i-1, j-1 and
        // k-1 (plus the wrapped terms on the upper periodic edges) and divides by
        // h_sparse_s. RLL[i-1][j][k], RLL[i][j-1][k] and RLL[i][j][k-1] are written
        // by earlier iterations of this same nest, so the nest carries a dependence
        // in all three dimensions. Because the i loop is split among threads and
        // nothing here orders one thread's write of RLL[i-1][j][k] before another
        // thread's read of it, the result can differ from the serial sweep.
        // NOTE(review): the chunk size nx/NTt evaluates to 0 under integer division
        // when nx < NTt, which is not a valid chunk size -- confirm nx >= NTt.
        #pragma omp for schedule(static, nx/NTt)
        for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++)
        {


            dummy = res_sparse_s[i][j][k];

                                           dummy -= COEFF[i][j][k][7] * RLL[i-1][j][k];
            if (PeriodicBoundaryX && i==nx)dummy -= COEFF[i][j][k][8] * RLL[1  ][j][k];


                                            dummy -= COEFF[i][j][k][2] * RLL[i][j-1][k];
            if (PeriodicBoundaryY && j==ny) dummy -= COEFF[i][j][k][3] * RLL[i][1  ][k];


                                            dummy -= COEFF[i][j][k][4] * RLL[i][j][k-1];
            if (PeriodicBoundaryZ && k==nz) dummy -= COEFF[i][j][k][5] * RLL[i][j][1  ];


            RLL[i][j][k] = dummy / h_sparse_s[i][j][k];
        }

    // loop 3
        // BACKWARD
        // Backward sweep over the same RLL array, visiting indices in descending
        // order: each point starts from RLL * h_sparse_s, subtracts
        // coefficient-weighted RLL values at i+1, j+1 and k+1 (plus the wrapped
        // terms on the lower periodic edges) and divides by h_sparse_s again.
        // The i+1 / j+1 / k+1 values are the ones written earlier in this descending
        // traversal, so the same cross-thread ordering concern as in the forward
        // sweep applies to the RLL[i+1][j][k] read.
        #pragma omp for schedule(static, nx/NTt)
        for (i=nx; i>=1;i--) for (j=ny; j>=1;j--) for (k=nz; k>=1;k--)
        {
            dummy = RLL[i][j][k]*h_sparse_s[i][j][k];

            if (PeriodicBoundaryX && i==1) dummy -= COEFF[i][j][k][7] * RLL[nx ][j][k];
                                           dummy -= COEFF[i][j][k][8] * RLL[i+1][j][k];

            if (PeriodicBoundaryY && j==1) dummy -= COEFF[i][j][k][2] * RLL[i][ny ][k];
                                           dummy -= COEFF[i][j][k][3] * RLL[i][j+1][k];

            if (PeriodicBoundaryZ && k==1) dummy -= COEFF[i][j][k][4] * RLL[i][j][nz ];
                                           dummy -= COEFF[i][j][k][5] * RLL[i][j][k+1];


            RLL[i][j][k] =  dummy  / h_sparse_s[i][j][k];
        }

}
  • 循环1-> [i][j][k]处的值是只读的,对p_sparse_s的[i+1]、[i-1]、[j-1]、[j+1]、[k-1]、[k+1]存在数据依赖性
  • 循环2-> 对[i][j][k]、[i-1]、[j-1]、[k-1]存在数据依赖性
  • 循环3-> 对[i][j][k]、[i+1]、[j+1]、[k+1]存在数据依赖性

编辑

COEFF[i][j][k][NUM]只是为3d空间中的每个点定义的通用系数(一些常数)。由于每个点有9个与相邻点相对应的系数,因此有COEFF[][][][0]、COEFF[][][][1] …… COEFF[][][][8]。

编辑

下面这段简短的代码展示了存在数据依赖的循环。我试图使内部k循环相对于i和j循环倾斜,以便可以对k循环进行矢量化处理。问题是:代码在串行运行时给出完全正确的答案,但如果我强制并行执行或强制对内层循环进行矢量化,就会得到一些奇怪的结果。

#include<stdio.h>
#include<stdlib.h>
#include<time.h>
#include<omp.h>

typedef double lr;

#define nx 4
#define ny 4
#define nz 4

/*
 * Print the interior cells a[1..nx][1..ny][1..nz] of a 3-D grid to stdout.
 *
 * Every value is written with "%f " (note the trailing space). Each run of
 * nz values along the last index forms one output line, and an extra empty
 * line is emitted after each block of ny lines. Cells at index 0 and at
 * nx+1 / ny+1 / nz+1 are never printed.
 */
void
print3dmatrix(double a[nx+2][ny+2][nz+2])
{
    int plane = 1;

    while (plane <= nx) {
        int row = 1;

        while (row <= ny) {
            /* One contiguous line of the grid along the last index. */
            double *line = a[plane][row];
            int col = 1;

            while (col <= nz) {
                printf("%f ", line[col]);
                col++;
            }
            printf("\n");
            row++;
        }
        printf("\n");
        plane++;
    }
}

/*
 * Demo driver: fills two identical (nx+2) x (ny+2) x (nz+2) grids with
 * pseudo-random values, applies the same in-place update to each -- once
 * with plain i/j/k loops and once with the innermost index shifted by j --
 * and prints both results so they can be compared.
 */
int 
main()
{

    double a[nx+2][ny+2][nz+2];
    double b[nx+2][ny+2][nz+2];

    // Fixed seed, so rand() yields the same sequence on every run.
    srand(3461833726);


    // matrix filling 
    // b is just a copy of a
    // Fills every cell, including index 0 and index n+1 in each dimension,
    // with a value in 0..4. The update loops below read the index-0 cells.
    for(int i=0; i< nx+2; i++) for(int j=0; j< ny+2; j++) for(int k=0; k< nz+2; k++)
    {
        a[i][j][k] = rand() % 5;
        b[i][j][k] = a[i][j][k];
    }

    // loop 1
    // Reference version: in-place update of the interior of a. Each cell combines
    // its own old value with a[i-1][j][k], a[i][j-1][k] and a[i][j][k-1], which
    // (for interior neighbours) were already overwritten by earlier iterations.
    // The nest therefore carries a dependence along i, j and k.
    //#pragma omp parallel for num_threads(1)
    for(int i=1; i<= nx; i++) for(int j=1; j<= ny; j++) for(int k=1; k<= nz; k++)
    {
        a[i][j][k] = -1*a[i-1][j][k] - 1*a[i][j-1][k] -1 * a[i][j][k-1] + 4 * a[i][j][k];
    }

    print3dmatrix(a);
    printf("******************************\n");

    // loop 2
    // Same update applied to b with the innermost index written as m = k + j:
    // m runs from j+1 to j+nz, so m-j runs from 1 to nz. The (i, j, k) cells are
    // visited in exactly the same order as in loop 1. b[i][j][m-j-1] is the cell
    // written by the previous m iteration, so the innermost loop still carries a
    // dependence from one iteration to the next; this is what the commented-out
    // simd pragma below would have to respect.
    //#pragma omp parallel for num_threads(1)
    for(int i=1; i<= nx; i++) 
        for(int j=1; j<= ny; j++)
            // #pragma omp simd
            for(int m=j+1; m<= j+nz; m++)
            {
                b[i][j][m-j] = -1*b[i-1][j][m-j] - 1*b[i][j-1][m-j] -1 * b[i][j][m-j-1] + 4 * b[i][j][m-j];
            }

    print3dmatrix(b);
    printf("=========================\n");

    return 0;
}

请参阅-loop skewing for vectorisation

0 个答案:

没有答案