Tags: python, julia, numba

Julia running an order of magnitude slower than Python


I was trying to port a Python code to Julia to try it out (both codes are given below). The Julia version runs about 10 times slower on my machine than the Python one. What am I doing wrong? I am pretty new to Julia, so I would appreciate any help.

Here is the Python code:

import matplotlib.pyplot as plt
import numpy as np  # needed for np.ones, np.arange, np.zeros below
from numba import jit
from numpy import random
import time

N=1000
kplus=2
kminus=1
T=20
T_th=10
sdt=1
frac = 0.5
threshold = frac*N

@jit(nopython=True)
def run(kp, km):
    base=np.ones(N)
    mb=np.arange(N)
    m=N
    th=0
    time_data = np.zeros(int(T/sdt))
    histogram=np.zeros(N+1)
    time_data[0]=N
    time_temp = sdt
    while th<T:
        if m==0:
            #print(th)
            break
        
        if th>time_temp:
            time_data[int(time_temp/sdt)] = m
            if th>T_th:
                histogram[int(m)] += 1
            #time_data[int(time_temp/sdt)] = N if m>threshold else 0
            time_temp = time_temp + 1*sdt
            
        kt=m*(kp+km)
        th=th+random.exponential(1/kt)
        ran=kt*random.rand()
        index=int(ran/(kp+km))
        rem=ran-index*(kp+km)
        #print(rem)
        if rem<km:
            base[mb[index]]=0
            tmp=mb[index]
            mb[index]=mb[m-1]
            mb[m-1]=tmp
            m=m-1
                
        else:
            pos=random.randint(N)
            if base[pos]==0:
                base[pos]=1
                mb[m]=pos
                m=m+1
                
            
    return time_data, histogram
    
    
num_runs = 1000
time_data_avg = np.zeros(int(T/sdt))
td_var=np.zeros(int(T/sdt))
hist=np.zeros(N+1)

for _ in range(num_runs):
    m,l = run(2,1)
    hist += l
    time_data_avg += m/num_runs
    td_var += m*m/num_runs
td_var -= time_data_avg**2

Here is the corresponding Julia code that I have written:

using Random
using Distributions
using Plots

N=1000
kplus=2
kminus=1
T=20
sdt=1
frac = 0.5
threshold = frac*N

function run(kp,km)
    base=fill(1,N)
    mb=collect(1:N)
    m=N
    th=0
    time_data = fill(0,floor(Int, T/sdt))
    time_data[1]=N
    time_temp = sdt
    
    while th<T
        # println(th, ' ', m)
        if m==0
            println(th)
            break
        end
        
        if th>time_temp
            time_data[ceil(Int, time_temp/sdt)+1]=m
            time_temp += sdt
        end
        
        kt=m*(kp+km)
        th=th+rand(Exponential(1/kt))
        ran=kt*rand(Float64)
        index=floor(Int,ran/(kp+km))
        rem=ran-index*(kp+km)
        index=index+1
        
        if rem<km
            base[mb[index]]=0
            tmp=mb[index]
            mb[index]=mb[m]
            mb[m]=tmp
            m=m-1
        else
            pos=rand(1:N)
            if base[pos]==0
                base[pos]=1
                mb[m+1]=pos
                m=m+1
            end
        end
        
    end
    return time_data
end


function sample(num_runs)
    time_data_avg = fill(0.0, floor(Int, T/sdt))
    td_var=fill(0.0, floor(Int, T/sdt))
    for i in 1:num_runs
        m = run(2,1)
        time_data_avg .+= m/num_runs
        td_var .+= m.*(m/num_runs)
    end
    td_var .-= time_data_avg.^2
    
    return time_data_avg, td_var
end

@time begin
   tm,tv=sample(1000)
end

For the Python code, I measure time with the bash time command. I have also made sure that Numba is not parallelizing.
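
(A note on the Julia measurement: @time on the first call also includes compiling run and sample; a minimal sketch of keeping compilation out of the number, assuming the definitions above are already loaded:)

sample(1)     # warm-up call so run and sample are compiled before measuring

@time begin   # now the report is (almost) pure run time
    tm, tv = sample(1000)
end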


Solution

  • I could not compare your codes directly on my machine, as the Python code as posted did not run and produce the timings for me. However, in Julia it is simple to fix the issue that @DNF and @Przemyslaw Szufel mentioned in the comments (the use of non-constant global variables such as N, T and sdt, which defeat type inference) without changing your code: just wrap everything in a function or a let block. Then you do not need to rewrite anything inside your functions. I have run such a test and the results are:

    Timing of your Julia code "as is"

     22.927925 seconds (360.97 M allocations: 5.731 GiB, 3.36% gc time, 1.49% compilation time)
    

    (it is immediately visible that something is wrong with the code, as you have 361M allocations amounting to 5.7 GiB of RAM)

    Timing of your code if I wrap it in a function

    What I do is just add two lines:

    function fun()
    

    at the top and one extra

    end
    

    at the bottom.

    The timing is:

    julia> fun()
      0.779523 seconds (5.00 k allocations: 16.144 MiB, 0.91% gc time)
    

    (allocations went significantly down; this could be further optimized, but I understand you want to compare exactly the same code in both languages)

    So you get a 22.9 / 0.78 = 29.3x speedup, which, given what you reported, should make it around 3x faster than Python.

    A small coding style comment: the one thing I did need to change was removing one of the blank lines before function sample(num_runs), as the Julia REPL normally assumes that two blank lines end a definition when code is pasted in (and when wrapping everything in a function you do not want its definition to be terminated early).

    Having said that, this is not how one would normally write a Julia program, as others have already commented. I recommend you check the Performance Tips section of the Julia Manual for details.
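
    To give a flavour of what those tips boil down to here, below is a minimal, self-contained sketch (a toy loop, not your simulation) of the non-constant-global problem and two common ways around it; wrapping your whole script in a function or a let block works because it turns the former globals into locals of known type in exactly the same way.

    n = 10_000_000                  # non-constant global: its type could change at any time

    function sum_global()           # every access to n is dynamically typed, so this is slow
        s = 0
        for i in 1:n
            s += i
        end
        return s
    end

    const n_const = 10_000_000      # const fixes the type, so the compiler can specialize

    function sum_const()
        s = 0
        for i in 1:n_const
            s += i
        end
        return s
    end

    function sum_arg(n)             # or simply pass the value as an argument
        s = 0
        for i in 1:n
            s += i
        end
        return s
    end

    sum_global(); sum_const(); sum_arg(10_000_000)   # same result, very different speed

    Timing these with @time shows the first version doing a large number of small allocations (boxing) on every iteration, while the other two do essentially none; this is exactly what the Performance Tips chapter warns about with non-constant globals.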

    EDIT

    Here is a quick-and-dirty rewrite of your code that does fewer allocations and is faster:

    function f()
        N=1000
        T=20
        sdt=1
        base=fill(1,N)
        mb=collect(1:N)
        time_data = fill(0,floor(Int, T/sdt))
    
        function run(kp,km)
            fill!(base, 1)
            mb .= 1:N
            fill!(time_data, 0)
            m=N
            th=0
            time_data[1]=N
            time_temp = sdt
            
            @inbounds while th<T
                # println(th, ' ', m)
                if m==0
                    println(th)
                    break
                end
                
                if th>time_temp
                    time_data[ceil(Int, time_temp/sdt)+1]=m
                    time_temp += sdt
                end
                
                kt=m*(kp+km)
                th=th+rand(Exponential(1/kt))
                ran=kt*rand(Float64)
                index=floor(Int,ran/(kp+km))
                rem=ran-index*(kp+km)
                index=index+1
                
                if rem<km
                    base[mb[index]]=0
                    tmp=mb[index]
                    mb[index]=mb[m]
                    mb[m]=tmp
                    m=m-1
                else
                    pos=rand(1:N)
                    if base[pos]==0
                        base[pos]=1
                        mb[m+1]=pos
                        m=m+1
                    end
                end
                
            end
            return time_data
        end
    
        function sample(num_runs)
            time_data_avg = fill(0.0, floor(Int, T/sdt))
            td_var=fill(0.0, floor(Int, T/sdt))
            for i in 1:num_runs
                m = run(2,1)
                time_data_avg .+= m ./ num_runs
                td_var .+= m.*(m ./ num_runs)
            end
            td_var .-= time_data_avg.^2
            
            return time_data_avg, td_var
        end
    
        @time begin
            tm,tv=sample(1000)
        end
    end
    

    It gives:

    julia> f();
      0.739664 seconds (2 allocations: 448 bytes)
    

    While the same measurement using your original (merely wrapped) code is:

    julia> f();
      0.778454 seconds (5.00 k allocations: 16.144 MiB)
    

    (so the difference in timing is not huge, but it is noticeable, and the allocations are significantly lower, which means less strain on the GC; I have not analyzed the logic of your code, I only changed the memory management)
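
    One further tweak that could be tried (not benchmarked here, so take it as a sketch) is the waiting-time draw: the standard library's Random.randexp() samples a unit-rate exponential, so rand(Exponential(1/kt)) can equivalently be written as randexp()/kt, dropping the Distributions call from the inner loop. A minimal illustration of the equivalence:

    using Random: randexp
    using Distributions: Exponential

    kt = 3.0

    # both draws follow an exponential distribution with mean 1/kt;
    # randexp() returns a unit-rate exponential, which we rescale by 1/kt
    x1 = rand(Exponential(1/kt))
    x2 = randexp() / kt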