Tags: python, julia, numba

Julia running an order of magnitude slower than Python


I was trying to port a Python code to Julia to try it out (both codes are given below). The Julia version runs about 10 times slower on my machine than the Python one. What am I doing wrong? I am pretty new to Julia, so I would appreciate any help.

Here is the Python code:

import matplotlib.pyplot as plt
import numpy as np  # needed for np.ones, np.arange, np.zeros below
from numba import jit
from numpy import random
import time

N=1000
kplus=2
kminus=1
T=20
T_th=10
sdt=1
frac = 0.5
threshold = frac*N

@jit(nopython=True)
def run(kp, km):
    base=np.ones(N)
    mb=np.arange(N)
    m=N
    th=0
    time_data = np.zeros(int(T/sdt))
    histogram=np.zeros(N+1)
    time_data[0]=N
    time_temp = sdt
    while th<T:
        if m==0:
            #print(th)
            break
        
        if th>time_temp:
            time_data[int(time_temp/sdt)] = m
            if th>T_th:
                histogram[int(m)] += 1
            #time_data[int(time_temp/sdt)] = N if m>threshold else 0
            time_temp = time_temp + 1*sdt
            
        kt=m*(kp+km)
        th=th+random.exponential(1/kt)
        ran=kt*random.rand()
        index=int(ran/(kp+km))
        rem=ran-index*(kp+km)
        #print(rem)
        if rem<km:
            base[mb[index]]=0
            tmp=mb[index]
            mb[index]=mb[m-1]
            mb[m-1]=tmp
            m=m-1
                
        else:
            pos=random.randint(N)
            if base[pos]==0:
                base[pos]=1
                mb[m]=pos
                m=m+1
                
            
    return time_data, histogram
    
    
num_runs = 1000
time_data_avg = np.zeros(int(T/sdt))
td_var=np.zeros(int(T/sdt))
hist=np.zeros(N+1)

for _ in range(num_runs):
    m,l = run(2,1)
    hist += l
    time_data_avg += m/num_runs
    td_var += m*m/num_runs
td_var -= time_data_avg**2

Here is the corresponding Julia code that I have written:

using Random
using Distributions
using Plots

N=1000
kplus=2
kminus=1
T=20
sdt=1
frac = 0.5
threshold = frac*N

function run(kp,km)
    base=fill(1,N)
    mb=collect(1:N)
    m=N
    th=0
    time_data = fill(0,floor(Int, T/sdt))
    time_data[1]=N
    time_temp = sdt
    
    while th<T
        # println(th, ' ', m)
        if m==0
            println(th)
            break
        end
        
        if th>time_temp
            time_data[ceil(Int, time_temp/sdt)+1]=m
            time_temp += sdt
        end
        
        kt=m*(kp+km)
        th=th+rand(Exponential(1/kt))
        ran=kt*rand(Float64)
        index=floor(Int,ran/(kp+km))
        rem=ran-index*(kp+km)
        index=index+1
        
        if rem<km
            base[mb[index]]=0
            tmp=mb[index]
            mb[index]=mb[m]
            mb[m]=tmp
            m=m-1
        else
            pos=rand(1:N)
            if base[pos]==0
                base[pos]=1
                mb[m+1]=pos
                m=m+1
            end
        end
        
    end
    return time_data
end


function sample(num_runs)
    time_data_avg = fill(0.0, floor(Int, T/sdt))
    td_var=fill(0.0, floor(Int, T/sdt))
    for i in 1:num_runs
        m = run(2,1)
        time_data_avg .+= m/num_runs
        td_var .+= m.*(m/num_runs)
    end
    td_var .-= time_data_avg.^2
    
    return time_data_avg, td_var
end

@time begin
   tm,tv=sample(1000)
end

For the Python code, I measure time with the bash time command. I have also made sure that Numba is not parallelizing.
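
(A note on the Julia measurement: @time on the first call also includes compiling run and sample; a minimal sketch of keeping compilation out of the number, assuming the definitions above are already loaded:)

sample(1)     # warm-up call so run and sample are compiled before measuring

@time begin   # now the report is (almost) pure run time
    tm, tv = sample(1000)
end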


Solution

  • I could not compare your codes directly on my machine, as the Python code as posted did not run and produce the timings for me. However, in Julia it is simple to fix the issue that @DNF and @Przemyslaw Szufel mentioned in the comments (the use of non-constant global variables such as N, T and sdt, which defeat type inference) without changing your code: just wrap everything in a function or a let block. Then you do not need to rewrite anything inside your functions. I have run such a test and the results are:

    Timing of your Julia code "as is"

     22.927925 seconds (360.97 M allocations: 5.731 GiB, 3.36% gc time, 1.49% compilation time)
    

    (it is immediately visible that something is wrong with the code, as you have 361M allocations amounting to 5.7 GiB of RAM)

    Timing of your code if I wrap it in a function

    What I do is just add two lines:

    function fun()
    

    at the top and one extra

    end
    

    at the bottom.

    The timing is:

    julia> fun()
      0.779523 seconds (5.00 k allocations: 16.144 MiB, 0.91% gc time)
    

    (allocations went significantly down; this could be further optimized, but I understand you want to compare exactly the same code in both languages)

    So you get a 22.9 / 0.78 = 29.3x speedup, which, given what you reported, should make it around 3x faster than Python.

    A small coding style comment: the one thing I did need to change was removing one of the blank lines before function sample(num_runs), as the Julia REPL normally assumes that two blank lines end a definition when code is pasted in (and when wrapping everything in a function you do not want its definition to be terminated early).

    Having said that, this is not how one would normally write a Julia program, as others have already commented. I recommend you check the Performance Tips section of the Julia Manual for details.
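
    To give a flavour of what those tips boil down to here, below is a minimal, self-contained sketch (a toy loop, not your simulation) of the non-constant-global problem and two common ways around it; wrapping your whole script in a function or a let block works because it turns the former globals into locals of known type in exactly the same way.

    n = 10_000_000                  # non-constant global: its type could change at any time

    function sum_global()           # every access to n is dynamically typed, so this is slow
        s = 0
        for i in 1:n
            s += i
        end
        return s
    end

    const n_const = 10_000_000      # const fixes the type, so the compiler can specialize

    function sum_const()
        s = 0
        for i in 1:n_const
            s += i
        end
        return s
    end

    function sum_arg(n)             # or simply pass the value as an argument
        s = 0
        for i in 1:n
            s += i
        end
        return s
    end

    sum_global(); sum_const(); sum_arg(10_000_000)   # same result, very different speed

    Timing these with @time shows the first version doing a large number of small allocations (boxing) on every iteration, while the other two do essentially none; this is exactly what the Performance Tips chapter warns about with non-constant globals.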

    EDIT

    Here is a quick-and-dirty rewrite of your code that does fewer allocations and is faster:

    function f()
        N=1000
        T=20
        sdt=1
        base=fill(1,N)
        mb=collect(1:N)
        time_data = fill(0,floor(Int, T/sdt))
    
        function run(kp,km)
            fill!(base, 1)
            mb .= 1:N
            fill!(time_data, 0)
            m=N
            th=0
            time_data[1]=N
            time_temp = sdt
            
            @inbounds while th<T
                # println(th, ' ', m)
                if m==0
                    println(th)
                    break
                end
                
                if th>time_temp
                    time_data[ceil(Int, time_temp/sdt)+1]=m
                    time_temp += sdt
                end
                
                kt=m*(kp+km)
                th=th+rand(Exponential(1/kt))
                ran=kt*rand(Float64)
                index=floor(Int,ran/(kp+km))
                rem=ran-index*(kp+km)
                index=index+1
                
                if rem<km
                    base[mb[index]]=0
                    tmp=mb[index]
                    mb[index]=mb[m]
                    mb[m]=tmp
                    m=m-1
                else
                    pos=rand(1:N)
                    if base[pos]==0
                        base[pos]=1
                        mb[m+1]=pos
                        m=m+1
                    end
                end
                
            end
            return time_data
        end
    
        function sample(num_runs)
            time_data_avg = fill(0.0, floor(Int, T/sdt))
            td_var=fill(0.0, floor(Int, T/sdt))
            for i in 1:num_runs
                m = run(2,1)
                time_data_avg .+= m ./ num_runs
                td_var .+= m.*(m ./ num_runs)
            end
            td_var .-= time_data_avg.^2
            
            return time_data_avg, td_var
        end
    
        @time begin
            tm,tv=sample(1000)
        end
    end
    

    It gives:

    julia> f();
      0.739664 seconds (2 allocations: 448 bytes)
    

    While the same measurement using your original (merely wrapped) code is:

    julia> f();
      0.778454 seconds (5.00 k allocations: 16.144 MiB)
    

    (so the difference in timing is not huge, but it is noticeable, and the allocations are significantly lower, which means less strain on the GC; I have not analyzed the logic of your code, I only changed the memory management)
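
    One further tweak that could be tried (not benchmarked here, so take it as a sketch) is the waiting-time draw: the standard library's Random.randexp() samples a unit-rate exponential, so rand(Exponential(1/kt)) can equivalently be written as randexp()/kt, dropping the Distributions call from the inner loop. A minimal illustration of the equivalence:

    using Random: randexp
    using Distributions: Exponential

    kt = 3.0

    # both draws follow an exponential distribution with mean 1/kt;
    # randexp() returns a unit-rate exponential, which we rescale by 1/kt
    x1 = rand(Exponential(1/kt))
    x2 = randexp() / kt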