python optimization jit numba runge-kutta

Ways to Optimize Independent N-Body Simulations

I am relatively new to parallel computing and the Numba package. I am looking for optimization methods for my stupendously parallel N-body simulation. I've applied everything I know so far with Numpy arrays, JIT compliers, and multiprocessing. However, I'm still not getting the speed I desire (I've seen videos where their codes are MUCH faster still)

What I have currently is a rather simple python integrator using Runge-Kutta Integration and two equations of motion. I work with numerical integrators a lot in my field so I would definitely like to pick up a few more tricks from you guys.

I have posted my code below, but essentially, I have one main function called "Motion" which takes 2 initial conditions and integrate their motion for a set amount of time. I have JITTED this function and all the functions it called upon iteratively: "RK4", "ODE", "Electric Field". Lastly, I call the pool function from Multiprocessing to parallelize the "Motion" function and insert different initial conditions for each simulation it runs.

Again, I've implemented every types of optimization I'm aware of, however I'm still not very happy with its speed. I've posted my code below. If anyone can spot a piece of algorithm that could be further optimized, that would be extremely helpful and educational (for me at least)! Thank you for your time.

import numpy as np
import matplotlib.pyplot as plt
from numba import njit, prange
from time import time
from tqdm import tqdm
import multiprocessing as mp
from IPython.display import clear_output
from scipy import interpolate


"Electric Field Information"
A = np.float32(1.00E-04)
N_waves = np.int(19)
frequency =  np.linspace(37.5,46.5,N_waves)*1e-3 #Set of frequencies used for Electric Field 
m = np.int(20) #Azimuthal Wave Number
sigma = np.float32(0.5) #Gaussian Width of E wave in L
zeta = np.float32(1)

"Particle Information"
N_Particles = np.int(10000)
q = np.float32(-1) #Charge of electron
mass = np.float32(0.511e6) #Mass of Proton eV/c^2
FirstAdiabatic = np.float32(2000e10) #MeV/Gauss Adiabatic Invariant


"Runge-Kutta Paramters"
Total_Time = np.float32(10) #hours
Step_Size = np.float32(0.2) #second 
Plot_Time = np.float32(60) #seconds
time_array = np.arange(0, Total_Time*3600+Step_Size, Step_Size) #Convert to seconds and Add End Point
N_points = len(time_array)

Skip_How_Many = int(Plot_Time/Step_Size) #Used to shorten our data set and save RAM

"Constants"    
Beq = np.float64(31221.60592e-9) #nT
Re = np.float32(6371e3) #Meters
c = np.float32(2.998e8) #m/s

"Start Electric Field Code"
def wave_peak(omega): #Called once so no need to JIT or Optimize this
    L_sample = np.linspace(1,10,100) 
    phidot = -3*FirstAdiabatic / (q* (L_sample*Re)**2 * np.sqrt(1+ (2*FirstAdiabatic*Beq/ (mass*L_sample**3)) ) )
    phidot_to_L = interpolate.interp1d(phidot,L_sample, kind = 'cubic')
    L0i = phidot_to_L(omega/m)
    return L0i 
omega = 2*np.pi*frequency
L0i_wave = wave_peak(omega)
Phi0i_wave = np.linspace(0,2*np.pi,N_waves)
np.random.shuffle(Phi0i_wave)

@njit(nogil= True)
def Electric_Field(t,r):
    E0 = A*np.exp(-(r[0]-L0i_wave)**2 / (2*sigma**2))
    Delta = np.arctan2( (r[0] * (r[0]-L0i_wave)/sigma**2 - 1), (2*np.pi*r[0]/zeta) )
    Er = E0/m * np.sqrt( (2*np.pi*r[0]/zeta)**2 + (r[0]*(r[0]-L0i_wave)/sigma**2 -1)**2 ) * np.cos(m*r[1] - omega*t + Phi0i_wave + 2*np.pi*r[0]/zeta + Delta)
    Ephi = E0*np.cos(m*r[1] - omega*t + Phi0i_wave + 2*np.pi*r[0]/zeta)
    return np.sum(Er),np.sum(Ephi)
"End of Electric Field Code"

"Particle's ODE - Equation of Motion"
@njit(nogil= True)
def ODE(t,r):
    Er, Ephi = Electric_Field(t,r) #Pull out the electric so we only call it once.
    Ldot = Ephi * r[0]**3 / (Re*Beq)
    Phidot = -Er * r[0]**2 / (Re*Beq) - 3* FirstAdiabatic / (q*r[0]**2*Re**2) * 1/np.sqrt(2*FirstAdiabatic*Beq/ (r[0]**3*mass) + 1)
    return np.array([Ldot,Phidot])

@njit(nogil= True)
def RK4(t,r): #Standard Runge-Kutta Integration Algorthim
    k1 = Step_Size*ODE(t,r)
    k2 = Step_Size*ODE(t+Step_Size/2, r+k1/2)
    k3 = Step_Size*ODE(t+Step_Size/2, r+k2)
    k4 = Step_Size*ODE(t+Step_Size, r+k3)
    return r + k1/6 + k2/3 + k3/3 + k4/6

@njit(nogil= True)
def Motion(L0,Phi0): #Insert Inital Conditions and it will loop through the RK4 integrator and output all its positions.
    L_Array = np.zeros_like(time_array)
    Phi_Array = np.zeros_like(time_array)
    
    L_Array[0] = L0
    Phi_Array[0] = Phi0
    for i in range(1,N_points):
        L_Array[i], Phi_Array[i] = RK4(time_array[i-1], np.array([ L_Array[i-1],Phi_Array[i-1] ]) )
    
    return L_Array[::Skip_How_Many], Phi_Array[::Skip_How_Many] 
    #Skip_How_Many is used to take up less RAM space since we don't need that kind of percsion in our data

# Location = Motion(5,0)
# x = Location[0]*np.cos(Location[1])
# y = Location[0]*np.sin(Location[1])
# plt.plot(x,y,"o", markersize = 0.5)
# ts = time()
# Motion(5,0)
# print('Solo Time:', time() - ts)

"Getting my Arrays ready so I can index it"
Split = int(np.sqrt(N_Particles))
L0i = np.linspace(4.4,5.5,Split)
Phi0i = np.linspace(0,360,Split) / 180 * np.pi
L0_Grid = np.repeat(L0i,Split) 
# ^Here I want to run a meshgrid of L0i and Phi0, so I repeat L0i using this function and mod (%) the index on the Phi Function


#Create Appending Array
results = []
def get_results(result): #Call back to this array from Multiprocessing to append the results it gives per run.
    results.append(result)
    clear_output()
    print("Getting Results %0.2f" %(len(results)/N_Particles * 100), end='\r')
    
if __name__ == '__main__':
    #Call In Multiprocessing
    pool = mp.Pool(mp.cpu_count()) #Counting number of threads to start
    ts = time() #Timing this process. Begins here
    for ii in range(N_Particles): #Not too sure what this does, but it works - I assume it parallelizes this loop
        pool.apply_async(Motion, args = (L0_Grid[ii],Phi0i[int(ii%Split)]), callback=get_results)
        
    pool.close() #I'm not too sure what this does but everyone uses it, and it won't work without it
    pool.join()
    print('Time in MP parallel:', time() - ts) #Output Time

Solution

I think the main reason why your code is slow is because your Runge-Kutta method has fixed time steps. Fancy ODE solvers will select the biggest time step that allows a tolerable amount of error. One example is the LSODA ODE solver from ODEPACK.

Below I've re-written your code using NumbaLSODA. On my computer, it speeds up your code by about 200x.

import numpy as np
import matplotlib.pyplot as plt
from numba import njit, prange
from time import time
from tqdm import tqdm
import multiprocessing as mp
from scipy import interpolate

from NumbaLSODA import lsoda_sig, lsoda
from numba import cfunc
import numba as nb

"Electric Field Information"
A = np.float32(1.00E-04)
N_waves = np.int(19)
frequency =  np.linspace(37.5,46.5,N_waves)*1e-3 #Set of frequencies used for Electric Field 
m = np.int(20) #Azimuthal Wave Number
sigma = np.float32(0.5) #Gaussian Width of E wave in L
zeta = np.float32(1)

"Particle Information"
N_Particles = np.int(10000)
q = np.float32(-1) #Charge of electron
mass = np.float32(0.511e6) #Mass of Proton eV/c^2
FirstAdiabatic = np.float32(2000e10) #MeV/Gauss Adiabatic Invariant

"Runge-Kutta Paramters"
Total_Time = np.float32(10) #hours
Step_Size = np.float32(0.2) #second 
Plot_Time = np.float32(60) #seconds
time_array = np.arange(0, Total_Time*3600+Step_Size, Step_Size) #Convert to seconds and Add End Point
N_points = len(time_array)

Skip_How_Many = int(Plot_Time/Step_Size) #Used to shorten our data set and save RAM

"Constants"    
Beq = np.float64(31221.60592e-9) #nT
Re = np.float32(6371e3) #Meters
c = np.float32(2.998e8) #m/s

"Start Electric Field Code"
def wave_peak(omega): #Called once so no need to JIT or Optimize this
    L_sample = np.linspace(1,10,100) 
    phidot = -3*FirstAdiabatic / (q* (L_sample*Re)**2 * np.sqrt(1+ (2*FirstAdiabatic*Beq/ (mass*L_sample**3)) ) )
    phidot_to_L = interpolate.interp1d(phidot,L_sample, kind = 'cubic')
    L0i = phidot_to_L(omega/m)
    return L0i 
omega = 2*np.pi*frequency
L0i_wave = wave_peak(omega)
Phi0i_wave = np.linspace(0,2*np.pi,N_waves)
np.random.shuffle(Phi0i_wave)

@njit
def Electric_Field(t,r):
    E0 = A*np.exp(-(r[0]-L0i_wave)**2 / (2*sigma**2))
    Delta = np.arctan2( (r[0] * (r[0]-L0i_wave)/sigma**2 - 1), (2*np.pi*r[0]/zeta) )
    Er = E0/m * np.sqrt( (2*np.pi*r[0]/zeta)**2 + (r[0]*(r[0]-L0i_wave)/sigma**2 -1)**2 ) * np.cos(m*r[1] - omega*t + Phi0i_wave + 2*np.pi*r[0]/zeta + Delta)
    Ephi = E0*np.cos(m*r[1] - omega*t + Phi0i_wave + 2*np.pi*r[0]/zeta)
    return np.sum(Er),np.sum(Ephi)
"End of Electric Field Code"

"Particle's ODE - Equation of Motion"
@cfunc(lsoda_sig)
def ODE(t, r_, dr, p):
    r = nb.carray(r_, (2,))
    Er, Ephi = Electric_Field(t,r)
    Ldot = Ephi * r[0]**3 / (Re*Beq)
    Phidot = -Er * r[0]**2 / (Re*Beq) - 3* FirstAdiabatic / (q*r[0]**2*Re**2) * 1/np.sqrt(2*FirstAdiabatic*Beq/ (r[0]**3*mass) + 1)
    dr[0] = Ldot
    dr[1] = Phidot
funcptr = ODE.address
    
@njit
def Motion(L0,Phi0):
    u0 = np.array([L0,Phi0],np.float64)
    data = np.array([5.0])
    usol, success = lsoda(funcptr, u0, time_array, data)
    L_Array = usol[:,0]
    Phi_Array = usol[:,1]
    return L_Array[::Skip_How_Many], Phi_Array[::Skip_How_Many]
    #Skip_How_Many is used to take up less RAM space since we don't need that kind of percsion in our data

Location = Motion(5,0)
x = Location[0]*np.cos(Location[1])
y = Location[0]*np.sin(Location[1])
plt.plot(x,y,"o", markersize = 0.5)
ts = time()
Motion(5,0)
print('Solo Time:', time() - ts)