内存增加与fortran& MKL并行

时间:2015-11-29 03:11:38

标签: fortran intel-mkl

我在Fortran 90中为矩阵产品状态编写了一个程序。在这个程序中,我使用的是英特尔MKL库。当编译器选项为:

OPTIMIZE= -parallel -par-threshold90 -ipo -O3 -no-prec-div -fp-model fast=2 -xHost

LinkLine = $(OPTIMIZE) $(DIAGNOSE) -mkl -static-intel,

在运行时,内存不断增加。 如果我将编译器选项更改为:

OPTIMIZE= -ipo -O3 -no-prec-div -fp-model fast=2 -xHost

LinkLine = $(OPTIMIZE) $(DIAGNOSE) -mkl=sequential -static-intel,

没有问题,一切都很好。 为什么会这样?这有什么解决方案吗?

按照Alexander的建议,我在下面发布了一个子程序(对矩阵进行奇异值分解):

SUBROUTINE SVD(Theta,S1,S2,lambda,Din,Dout)
IMPLICIT NONE

INTEGER, INTENT(IN) :: Din, Dout
COMPLEX(kind=rKind),DIMENSION(Din,Din), INTENT(IN) :: Theta

REAL(kind=rKind), INTENT(OUT) :: lambda(Dout)
COMPLEX(kind=rKind), INTENT(OUT) :: S1(Din,Dout),S2(Din,Dout)

COMPLEX(kind=rKind) :: M(Din,Din)
REAL(kind=rKind) :: temp_lam(Din)
COMPLEX(kind=rKind) :: temp_U(Din,Din), temp_V(Din,Din)
COMPLEX(kind=rKind) :: WORK(10*Din)
REAL(kind=rKind) :: RWORK(5*Din)
REAL(kind=rKind), PARAMETER :: Truncation = 1E-13_rKind
INTEGER :: INFO, i

!----------------------------

M = Theta

CALL ZGESVD('A','A',Din,Din,M,Din,temp_lam,temp_U,Din,temp_V,Din,WORK,10*Din,RWORK,INFO)
lambda = 0.0_rKind
S1 = 0.0_rKind
S2 = 0.0_rKind

DO i = 1,Dout
    IF (temp_lam(i)/temp_lam(1)>Truncation) THEN
        lambda(i) = temp_lam(i)
        S1(:,i) = temp_U(:,i)
        S2(:,i) = temp_V(i,:)
    END IF
END DO

END SUBROUTINE SVD

另一个子程序是

SUBROUTINE TwoSiteUpdate(A_update,B_update,lambda_2_update,&
& A,B,lambda_1,lambda_2,U,D1,D2_old,D2_new)

! Designed for 1D MPS.

IMPLICIT NONE

INTEGER, INTENT(IN) :: D1, D2_old, D2_new

COMPLEX(kind=rKind), INTENT(IN) :: A(D1,D2_old,localSize), B(D2_old,D1,localSize)

REAL(kind=rKind), INTENT(IN) :: lambda_1(D1), lambda_2(D2_old)

COMPLEX(kind=rKind), INTENT(IN) :: U(localSize,localSize,localSize,localSize)

COMPLEX(kind=rKind), INTENT(OUT) :: A_update(D1,D2_new,localSize), B_update(D2_new,D1,localSize)

REAL(kind=rKind), INTENT(OUT) :: lambda_2_update(D2_new)


COMPLEX(kind=rKind) :: temp_A(D1,D2_old,localSize)

COMPLEX(kind=rKind) :: temp_B(D2_old,D1,localSize)

COMPLEX(kind=rKind) :: Theta(D1,localSize,D1,localSize)

COMPLEX(kind=rKind) :: Theta_S(D1*localSize,D1*localSize)

COMPLEX(kind=rKind) :: S1(D1*localSize,D2_new), S2(D1*localSize,D2_new)

COMPLEX(kind=rKind) :: S1_tmp(D1,localSize,D2_new)

COMPLEX(kind=rKind) :: S2_tmp(D1,localSize,D2_new)


REAL(kind=rKind), PARAMETER :: Truncation=1E-16_rKind
COMPLEX(kind=rKind) :: tmp_ele
INTEGER :: i1,i2,i3,im
INTEGER :: mA1,mB1,mA2,mB2
INTEGER :: i



!------------------- add lambda on A, B ----------------------

temp_A = 0.0_rKind
temp_B = 0.0_rKind

DO im = 1,localSize
    DO i2 = 1,D2_old
        DO i1 = 1,D1

            IF(lambda_1(i1)>Truncation) THEN
                temp_A(i1,i2,im) = A(i1,i2,im)*SQRT(lambda_1(i1))
                temp_B(i2,i1,im) = B(i2,i1,im)*SQRT(lambda_1(i1))
            END IF

        END DO
    END DO
END DO

!--------------- svd on Theta ----------------

Theta = 0.0_rKind

DO mB2 = 1,localSize
    DO i2 = 1,D1
        DO mA2 = 1,localSize
            DO i1 = 1,D1

                tmp_ele = 0.0_rKind
                DO mB1 = 1,localSize
                    DO mA1 = 1,localSize
                        DO i3 = 1,D2_old
                            tmp_ele = tmp_ele + U(mA2,mB2,mA1,mB1)*temp_A(i1,i3,mA1)*temp_B(i3,i2,mB1)
                        END DO
                    END DO
                END DO
                Theta(i1,mA2,i2,mB2) = tmp_ele

            END DO
        END DO
    END DO
END DO


Theta_S = RESHAPE(Theta,SHAPE=(/D1*localSize,D1*localSize/))

CALL SVD(Theta_S,S1,S2,lambda_2_update,D1*localSize,D2_new)

lambda_2_update = lambda_2_update/lambda_2_update(1)

S1_tmp = RESHAPE(S1,SHAPE=(/D1,localSize,D2_new/))

S2_tmp = RESHAPE(S2,SHAPE=(/D1,localSize,D2_new/))

!---------------- update A, B ----------------

A_update = 0.0_rKind
B_update = 0.0_rKind

DO im = 1,localSize
    DO i2 = 1,D2_new
        DO i1 = 1,D1
            IF (lambda_1(i1)>Truncation) THEN
                A_update(i1,i2,im) = S1_tmp(i1,im,i2)*SQRT(lambda_2_update(i2))/SQRT(lambda_1(i1))
                B_update(i2,i1,im) = S2_tmp(i1,im,i2)*SQRT(lambda_2_update(i2))/SQRT(lambda_1(i1))
            END IF
        END DO
    END DO
END DO

END SUBROUTINE TwoSiteUpdate

以上两个子程序是程序的耗时部分。没有这两个子程序,程序就可以了,内存不会增加。有了这两个,当我使用并行版本时,内存会增加。

1 个答案:

答案 0 :(得分:0)

您可以检查是否在循环中使用了带可分配项的复合类型。您也可以使用OpenMP并行指令和-openmp编译选项来手动控制并行的位置,而不是自动并行,如下所示

$omp parallel do
do i = 1, n
...
$ ifort -openmp foo.f90