My question is about what is the fastest way to provide initial values to an array in Fortran 90 or above version, for both serial and OpenMP. I can try
(a) A = 0.0
; or
(b) do nested loops for A(i, j...) = 0.0
and adjust the order of loops to fit the vectorization (innermost to the first argument)
I somehow remembered, but cannot find the reference after googled a few times, that complier will try to do vectorization for (a). Here is the test for serial level (sorry the code is messy, not procedure-oriented, and some variable names etc adopted from previous replies)
Program vectorization_test
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
real :: A(20,20,20,20), sum_time
integer :: i,j,k,l,n,m, m_iter
Integer( li ) :: start, finish, rate
m_iter = 10
n = 20
sum_time = 0.0
do m = 1, m_iter
Call System_clock( start, rate )
A= 0.0
Call System_clock( finish, rate )
write(*,*) 'time 1', Real( finish - start, wp ) / rate
sum_time = sum_time + Real( finish - start, wp ) / rate
end do
write(*,*) 'average time', sum_time / m_iter
sum_time = 0.0
do m = 1, m_iter
Call System_clock( start, rate )
do l = 1, n
do k = 1, n
do j = 1, n
do i = 1, n
A(i,j,k,l) = 0.0
end do
end do
end do
end do
Call System_clock( finish, rate )
write(*,*) 'time 2', Real( finish - start, wp ) / rate
sum_time = sum_time + Real( finish - start, wp ) / rate
end do
write(*,*) 'average time 2', sum_time / m_iter
sum_time = 0.0
do m = 1, m_iter
Call System_clock( start, rate )
do l = 1, n
do j = 1, n
do k = 1, n
do i = 1, n
A(i,j,k,l) = 0.0
end do
end do
end do
end do
Call System_clock( finish, rate )
write(*,*) 'time 3', Real( finish - start, wp ) / rate
sum_time = sum_time + Real( finish - start, wp ) / rate
end do
write(*,*) 'average time 3', sum_time / m_iter
sum_time = 0.0
do m = 1, m_iter
Call System_clock( start, rate )
do i = 1, n
do j = 1, n
do k = 1, n
do l = 1, n
A(i,j,k,l) = 0.0
end do
end do
end do
end do
Call System_clock( finish, rate )
write(*,*) 'time 4', Real( finish - start, wp ) / rate
sum_time = sum_time + Real( finish - start, wp ) / rate
end do
write(*,*) 'average time 4', sum_time / m_iter
end program vectorization_test
I got average time 3.76699973E-05, average time 2 5.98790008E-04, average time 3 6.55650045E-04, average time 4 3.10386019E-03
from gfortran-11 -o3
on my laptop with 16 GB memory. On computing center with 384 GB memory, I got average time 4.75034976E-05, average time 2 , 4.47604398E-04, average time 3 4.70327737E-04, average time 4 4.14085982E-04
. Larger dimensions similar trend.
Not sure if this holds for other compilers. Seems the innermost loop is most critical for vectorization.
So my questions are
(1) is there any reference for this issue on vectorization and initialization of array;
(2) if I use OpenMP, shall I use a single loop for one variable, A(i,:,:,:) = 0.0
something like that?
P.S. The initialization of array is mostlikely not the bottleneck, so the question is more belong to my curious.
Try varying to first index the fastest
Call System_clock( start, rate )
do l = 1, n
do k = 1, n
do j = 1, n
do i = 1, n
A(i,j,k,l) = 0.0
end do
end do
end do
end do
Call System_clock( finish, rate )
Since Fortran is column-major, this means the first index puts values as near as they can be and thus utilizing the CPU cache to avoid excess memory access which is 100× slower than cache access.
In the end I don't think it would make much of a difference, as the compiler is pretty good at optimizing code.
In my testing with ifort
in release build with parallel I get two sets of results based on the floating point setting:
I measured initializations per second:
Method | /fp:fast |
/fp:precise |
Description |
---|---|---|---|
LOOP | 440.9171 | 403.2258 | Four loops |
ATOM | 443.4590 | 432.5259 | a=x |
SPAN | 443.8526 | 457.8755 | a(:,:,:,:)=x |
PARA | 445.0378 | 438.4042 | $omp parallel |
Code listing:
program Console1
implicit none
! Variables
integer, parameter :: n = 60, repeat=1000
integer :: iter
real :: x, a(n,n,n,n)
integer(8) :: tic, toc, rate
! Body of Console1
x = 4*atan(1.0)
call SYSTEM_CLOCK(tic,rate)
do iter=1, repeat
call r_fill_loop(a,x)
end do
call SYSTEM_CLOCK(toc,rate)
print *, "LOOP", (rate*repeat)/real(toc-tic), "ips"
call SYSTEM_CLOCK(tic,rate)
do iter=1, repeat
call r_fill_atom(a,x)
end do
call SYSTEM_CLOCK(toc,rate)
print *, "ATOM", (rate*repeat)/real(toc-tic), "ips"
call SYSTEM_CLOCK(tic,rate)
do iter=1, repeat
call r_fill_span(a,x)
end do
call SYSTEM_CLOCK(toc,rate)
print *, "SPAN", (rate*repeat)/real(toc-tic), "ips"
call SYSTEM_CLOCK(tic,rate)
do iter=1, repeat
call r_fill_parallel(a,x)
end do
call SYSTEM_CLOCK(toc,rate)
print *, "PARA", (rate*repeat)/real(toc-tic), "ips"
contains
pure subroutine r_fill_loop(a,x)
real, intent(out) :: a(:,:,:,:)
real, intent(in) :: x
integer :: n, m, g, h
integer :: i,j,k,l
n = size(a,1)
m = size(a,2)
g = size(a,3)
h = size(a,4)
do l=1, h
do k=1, g
do j=1, m
do i=1,n
a(i,j,k,l) = x
end do
end do
end do
end do
end subroutine
pure subroutine r_fill_atom(a,x)
real, intent(out) :: a(:,:,:,:)
real, intent(in) :: x
a = x
end subroutine
pure subroutine r_fill_parallel(a,x)
real, intent(out) :: a(:,:,:,:)
real, intent(in) :: x
integer :: n, m, g, h
integer :: i,j,k,l
n = size(a,1)
m = size(a,2)
g = size(a,3)
h = size(a,4)
!$OMP PARALLEL
!$OMP DO
do l=1, h
do k=1, g
do j=1, m
do i=1,n
a(i,j,k,l) = x
end do
end do
end do
end do
!$OMP END DO
!$OMP END PARALLEL
end subroutine
pure subroutine r_fill_span(a,x)
real, intent(out) :: a(:,:,:,:)
real, intent(in) :: x
a(:,:,:,:) = x
end subroutine
end program Console1
Side note on precision and roundoff errors. I did a sum(a)
in the end and compared it to n*n*n*n*x = 40715040.79
which is the expected value.
With /fp:fast=2
I get sum(a) = 40738716.0
With /fp:precise
I get sum(a) = 46579532.0
The above is very surprising that the precise floating-point model gives far worse accuracy compared to the fast model.
here are the compiler options I used:
[IFORT]
/nologo /O3 /Qparallel /heap-arrays200 /fp:fast=2 /module:x64\Release\ /object:
x64\Release\ /Fdx64\Release\vc150.pdb /libs:dll /threads /c /Qlocation,link,C:\
Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.16.
27023\bin\HostX64\x64 /Qm64