The openmp matrix multiplication

I try to write a Openmp based matrix multiplication code. The multiplication of matrix mm and matrix mmt is diagonal matrix and equal to one. I try normal calculation and Openmp. The normal result is correct, however the Openmp result is wrong. I think it should be relative to the Openmp utilization.

program main
implicit none
double precision,dimension(0:8,0:8)::MM,MMT,MTEMP

MM=reshape((/ 1 ,   1   ,   1   ,   1   ,   1   ,   1   ,   1   ,   1   ,   1   ,   &
-4  ,   -1  ,   -1  ,   -1  ,   -1  ,   2   ,   2   ,   2   ,   2   ,   &
4   ,   -2  ,   -2  ,   -2  ,   -2  ,   1   ,   1   ,   1   ,   1   ,   &
0   ,   1   ,   0   ,   -1  ,   0   ,   1   ,   -1  ,   -1  ,   1   ,   &
0   ,   -2  ,   0   ,   2   ,   0   ,   1   ,   -1  ,   -1  ,   1   ,   &
0   ,   0   ,   1   ,   0   ,   -1  ,   1   ,   1   ,   -1  ,   -1  ,   &
0   ,   0   ,   -2  ,   0   ,   2   ,   1   ,   1   ,   -1  ,   -1  ,   &
0   ,   1   ,   -1  ,   1   ,   -1  ,   0   ,   0   ,   0   ,   0   ,   &
0   ,   0   ,   0   ,   0   ,   0   ,   1   ,   -1  ,   1   ,   -1  /),shape(MM))



MMT=1.d0/36.d0 * reshape  ((/ 4 ,   -4  ,   4   ,   0   ,   0   ,   0   ,   0   ,   0   ,   0   ,   &
                4   ,   -1  ,   -2  ,   6   ,   -6  ,   0   ,   0   ,   9   ,   0   ,   &
                4   ,   -1  ,   -2  ,   0   ,   0   ,   6   ,   -6  ,   -9  ,   0   ,   &
                4   ,   -1  ,   -2  ,   -6  ,   6   ,   0   ,   0   ,   9   ,   0   ,   &
                4   ,   -1  ,   -2  ,   0   ,   0   ,   -6  ,   6   ,   -9  ,   0   ,   &
                4   ,   2   ,   1   ,   6   ,   3   ,   6   ,   3   ,   0   ,   9   ,   &
                4   ,   2   ,   1   ,   -6  ,   -3  ,   6   ,   3   ,   0   ,   -9  ,   &
                4   ,   2   ,   1   ,   -6  ,   -3  ,   -6  ,   -3  ,   0   ,   9   ,   &
                4   ,   2   ,   1   ,   6   ,   3   ,   -6  ,   -3  ,   0   ,   -9  /),shape(MMT))

!$OMP PARALLEL
          call multi(mm,mmt,mtemp)

                PRINT*,MTEMP
!$OMP END PARALLEL


endprogram main

subroutine multi(m1,m2,m3)
double precision,dimension(0:8,0:8)::m1,m2,m3
double precision,dimension(0:80)::mm1,mm2
DOUBLE PRECISION::TEMP
integer::i,j,k
!$OMP DO
do j=0,8
    do i=0,8
        mm1(j*9+i)=m1(i,j)
        mm2(i*9+j)=m2(i,j)
    enddo
enddo
!$OMP ENDDO
!$OMP DO PRIVATE(TEMP,I,J,K)
do j=0,8
    do i=0,8
        temp=0
        do k=0,8
            temp=temp+mm1(j*9+k)*mm2(i*9+k)
        enddo
        m3(i,j)=temp
    enddo
enddo
!$OMP ENDDO
return


endsubroutine

I give the normal version below, and if you compute this, the result is diagnonal 1 matrix.

program main
implicit none
double precision,dimension(0:8,0:8)::MM,MMT,MTEMP

MM=reshape((/ 1 ,   1   ,   1   ,   1   ,   1   ,   1   ,   1   ,   1   ,   1   ,   &
-4  ,   -1  ,   -1  ,   -1  ,   -1  ,   2   ,   2   ,   2   ,   2   ,   &
4   ,   -2  ,   -2  ,   -2  ,   -2  ,   1   ,   1   ,   1   ,   1   ,   &
0   ,   1   ,   0   ,   -1  ,   0   ,   1   ,   -1  ,   -1  ,   1   ,   &
0   ,   -2  ,   0   ,   2   ,   0   ,   1   ,   -1  ,   -1  ,   1   ,   &
0   ,   0   ,   1   ,   0   ,   -1  ,   1   ,   1   ,   -1  ,   -1  ,   &
0   ,   0   ,   -2  ,   0   ,   2   ,   1   ,   1   ,   -1  ,   -1  ,   &
0   ,   1   ,   -1  ,   1   ,   -1  ,   0   ,   0   ,   0   ,   0   ,   &
0   ,   0   ,   0   ,   0   ,   0   ,   1   ,   -1  ,   1   ,   -1  /),shape(MM))



MMT=1.d0/36.d0 * reshape  ((/ 4 ,   -4  ,   4   ,   0   ,   0   ,   0   ,   0   ,   0   ,   0   ,   &
                4   ,   -1  ,   -2  ,   6   ,   -6  ,   0   ,   0   ,   9   ,   0   ,   &
                4   ,   -1  ,   -2  ,   0   ,   0   ,   6   ,   -6  ,   -9  ,   0   ,   &
                4   ,   -1  ,   -2  ,   -6  ,   6   ,   0   ,   0   ,   9   ,   0   ,   &
                4   ,   -1  ,   -2  ,   0   ,   0   ,   -6  ,   6   ,   -9  ,   0   ,   &
                4   ,   2   ,   1   ,   6   ,   3   ,   6   ,   3   ,   0   ,   9   ,   &
                4   ,   2   ,   1   ,   -6  ,   -3  ,   6   ,   3   ,   0   ,   -9  ,   &
                4   ,   2   ,   1   ,   -6  ,   -3  ,   -6  ,   -3  ,   0   ,   9   ,   &
                4   ,   2   ,   1   ,   6   ,   3   ,   -6  ,   -3  ,   0   ,   -9  /),shape(MMT))


                call multi(mm,mmt,mtemp)

                PRINT*,MTEMP



endprogram main

subroutine multi(m1,m2,m3)
double precision,dimension(0:8,0:8)::m1,m2,m3
double precision,dimension(0:80)::mm1,mm2
DOUBLE PRECISION::TEMP
integer::i,j,k

do j=0,8
    do i=0,8
        mm1(j*9+i)=m1(i,j)
        mm2(i*9+j)=m2(i,j)
    enddo
enddo


do j=0,8
    do i=0,8
        temp=0
        do k=0,8
            temp=temp+mm1(j*9+k)*mm2(i*9+k)
        enddo
        m3(i,j)=temp
    enddo
enddo

return


  endsubroutine

Solution

To solve your problem one solution is to place the parallel region inside the multi subroutine. This code gives the same result as the serial one:

subroutine multi(m1,m2,m3)
double precision,dimension(0:8,0:8)::m1,m2,m3
double precision,dimension(0:80)::mm1,mm2
DOUBLE PRECISION::TEMP
integer::i,j,k
!$OMP PARALLEL
    !$OMP DO
    do j=0,8
        do i=0,8
            mm1(j*9+i)=m1(i,j)
            mm2(i*9+j)=m2(i,j)
        enddo
    enddo
    !$OMP ENDDO
    !$OMP DO PRIVATE(TEMP,I,J,K)
    do j=0,8
        do i=0,8
            temp=0
            do k=0,8
                temp=temp+mm1(j*9+k)*mm2(i*9+k)
            enddo
            m3(i,j)=temp
        enddo
    enddo
    !$OMP ENDDO
!$OMP END PARALLEL
return

Note that the workload is very small, so it may not be faster than the serial code. Note also that if you do not need mm1 and mm2 arrays later then you do not have to calculate them:

subroutine multi(m1,m2,m3)
double precision,dimension(0:8,0:8)::m1,m2,m3
DOUBLE PRECISION::TEMP
integer::i,j,k
!$OMP PARALLEL
    !$OMP DO PRIVATE(TEMP,I,J,K)
    do j=0,8
        do i=0,8
            temp=0
            do k=0,8
                temp=temp+m1(k,j)*m2(i,k)
            enddo
            m3(i,j)=temp
        enddo
    enddo
    !$OMP ENDDO
!$OMP END PARALLEL
return