Why my c++ code is slower than python code doing same thing?

I'm new to the programming. I've tried to make 3 scripts in Python, c++ and Matlab that would calculate Voltage over Inductor in RL circuit using matrices and finite difference method. When i ran those codes it appeared that MatLab took 0.4s to compute, Python took 1s to compute and c++ over 30s. Here I'm posting c++ code:

#include <iostream>
#include <vector>
#include <cmath>
#include <chrono>
#include <Eigen/Dense>
#include "matplotlibcpp.h" 
#include <omp.h>

namespace plt = matplotlibcpp;

double R = 10.0;
double L = 2.0;
double V0 = 5.0;
double t1 = 1.0;
double t2 = 2.0;
double t_start = 0.0;
double t_end = 3.0;

double exact_voltage(double t) {
    if (t < t1) {
        return 0;
    } else if (t1 <= t && t < t2) {
        return V0 * std::exp(-(R / L) * (t - t1));
    } else {
        return -V0 * std::exp(-(R / L) * (t - t2));
    }
}

void solve(double dt, std::vector<double>& solved_s_times, std::vector<double>& solved_V_L) {
    int N = static_cast<int>((t_end - t_start) / dt);

    Eigen::MatrixXd A = Eigen::MatrixXd::Zero(N, N);
    Eigen::VectorXd B = Eigen::VectorXd::Zero(N);

    for (int i = 0; i < N; ++i) {
        A(i, i) = L / dt + R;
        if (i > 0) {
            A(i, i - 1) = -L / dt;
        }
    }

    std::vector<double> solve_times(N);
    for (int i = 0; i < N; ++i) {
        solve_times[i] = t_start + i * dt;
        if (t1 <= solve_times[i] && solve_times[i] < t2) {
            B(i) = V0;
        } else if (solve_times[i] >= t2) {
            B(i) = 0.0;
        }
    }

    Eigen::VectorXd I = A.colPivHouseholderQr().solve(B);

    Eigen::MatrixXd A_dI_dt = Eigen::MatrixXd::Zero(N, N);
    for (int i = 0; i < N - 1; ++i) {
        A_dI_dt(i, i) = -1.0 / dt;
        A_dI_dt(i, i + 1) = 1.0 / dt;
    }

    Eigen::VectorXd dI_dt = A_dI_dt * I;

    solved_V_L.resize(N);
    solved_s_times.resize(N);

    for (int i = 0; i < N; ++i) {
        solved_V_L[i] = L * dI_dt[i];
        solved_s_times[i] = solve_times[i] + dt;
    }
}

int main() {

    std::cout << Eigen::nbThreads() << std::endl;
    std::vector<double> dt_values = {0.002, 0.001, 0.004};

    auto start = std::chrono::high_resolution_clock::now(); // Start pomiaru czasu

    for (double dt : dt_values) {
        std::vector<double> times, V_L;
        solve(dt, times, V_L);
        plt::plot(times, V_L, {{"label", "Symulacja (dt = " + std::to_string(dt) + ")"}});
    }

    auto end = std::chrono::high_resolution_clock::now(); // Koniec pomiaru czasu
    std::chrono::duration<double> duration = end - start;
    std::cout << "Wykonano w: " << duration.count() << "s" << std::endl;

    std::vector<double> exact_times, exact_V_L;
    double dt_exact = 0.001;
    int N_exact = static_cast<int>((t_end - t_start) / dt_exact);

    exact_times.resize(N_exact);
    exact_V_L.resize(N_exact);

    for (int i = 0; i < N_exact; ++i) {
        exact_times[i] = t_start + i * dt_exact;
        exact_V_L[i] = exact_voltage(exact_times[i]);
    }

    plt::plot(exact_times, exact_V_L, {{"label", "Dokładne rozwiązanie"}});
    plt::xlabel("Czas (s)");
    plt::ylabel("Napięcie (V)");
    plt::title("Napięcie na cewce w obwodzie RL - symulacja i dokładne rozwiązanie");
    plt::grid(true);
    plt::legend();
    plt::show();

    return 0;
}

And the Python code:

import numpy as np
import matplotlib.pyplot as plt
import time
from numba import njit

R = 10.0
L = 2.0
V0 = 5.0

t_start = 0.0
t_end = 3
dt_values = [0.002, 0.001, 0.004]

plt.figure(figsize=(12.8, 7.2))

t1 = 1
t2 = 2


# @njit
def exact_voltage(t):
    if t < t1:
        return 0
    elif t1 <= t < t2:
        return V0 * np.exp(-(R / L) * (t - t1))
    else:
        return -V0 * np.exp(-(R / L) * (t - t2))


# @njit
def solve(dt):
    solve_times = np.arange(t_start, t_end, dt)
    N = len(solve_times)

    A = np.zeros((N, N))
    B = np.zeros(N)

    for i in range(N):
        A[i, i] = L / dt + R
        if i > 0:
            A[i, i - 1] = -L / dt

    for i in range(N):
        if t1 <= solve_times[i] < t2:
            B[i] = V0
        elif solve_times[i] >= t2:
            B[i] = 0.0

    I = np.linalg.solve(A, B)

    A_dI_dt = np.zeros((N, N))

    for i in range(N - 1):
        A_dI_dt[i, i] = - 1 / dt
        A_dI_dt[i, i + 1] = 1 / dt

    dI_dt = np.dot(A_dI_dt, I)

    solved_V_L = L * dI_dt

    solved_s_times = np.zeros(N)
    for i in range(N):
        solved_s_times[i] = solve_times[i] + dt

    return solved_s_times, solved_V_L


start = time.time()
for dt_value in dt_values:
    times, V_L = solve(dt_value)
    plt.plot(times, V_L, label=f'Symulacja (dt = {dt_value})')
end = time.time()
print(f"Wykonano w: {end - start}s")
exact_times = np.arange(t_start, t_end, 0.001)
exact_V_L = [exact_voltage(t) for t in exact_times]
plt.plot(exact_times, exact_V_L, 'k--', label='Dokładne rozwiązanie')

plt.xlabel('Czas (s)')
plt.ylabel('Napięcie (V)')
plt.title('Napięcie na cewce w obwodzie RL - symulacja i dokładne rozwiązanie')
plt.grid(True)
plt.legend()
plt.show()

I've tried using different c++ compilers, adding -O3 and -fopenmp to build. I added & to functions too when i watched video on youtube explaining poor c++ performance. None of these actions helped much, they all subtracted around 1s from computation time of c++ code. Maybe someone sees something obvious that i'm missing? I'll be very thankful for explaining why my code is that much slower than Python and MatLab. If you could suggest any changes to the code that will help speed it up it would be great.

Solution

The choice of the solver has the greatest effect here since you spend 100% of the time in matrix solve for a 1500x1500 matrix. Matlab uses Intel-MKL under the hood, and numpy might be using it depending on where you download it from, while Eigen doesn't use Intel-MKL by default, see Using Intel MKL from Eigen

Since most of the matrix elements are zeros you can use a sparse matrix solver, i am not super familiar with all sparse solvers, so i will use the Eigen::SimplicialCholesky from the eigen examples

Also you need to compile with optimizations, for MSVC i used /O2 /arch:avx512 which roughly translate to gcc -O3 -march=native the result is only 31 milliseconds without plotting on this 2.5 GHz machine, without Intel-MKL

#include <iostream>
#include <vector>
#include <cmath>
#include <chrono>
#include <Eigen/Dense>
#include <omp.h>
#include <Eigen/Sparse>

double R = 10.0;
double L = 2.0;
double V0 = 5.0;
double t1 = 1.0;
double t2 = 2.0;
double t_start = 0.0;
double t_end = 3.0;

double exact_voltage(double t) {
    if (t < t1) {
        return 0;
    }
    else if (t1 <= t && t < t2) {
        return V0 * std::exp(-(R / L) * (t - t1));
    }
    else {
        return -V0 * std::exp(-(R / L) * (t - t2));
    }
}

void solve(double dt, std::vector<double>& solved_s_times, std::vector<double>& solved_V_L) {
    int N = static_cast<int>((t_end - t_start) / dt);
    using SpMat = Eigen::SparseMatrix<double>;

    SpMat A = SpMat(N, N);
    Eigen::VectorXd B = Eigen::VectorXd::Zero(N);

    for (int i = 0; i < N; ++i) {
        A.insert(i, i) = L / dt + R;
        if (i > 0) {
            A.insert(i, i - 1) = -L / dt;
        }
    }

    std::vector<double> solve_times(N);
    for (int i = 0; i < N; ++i) {
        solve_times[i] = t_start + i * dt;
        if (t1 <= solve_times[i] && solve_times[i] < t2) {
            B(i) = V0;
        }
        else if (solve_times[i] >= t2) {
            B(i) = 0.0;
        }
    }


    // Solving:
    Eigen::SimplicialCholesky<SpMat> chol(A);  // performs a Cholesky factorization of A
    Eigen::VectorXd I = chol.solve(B);         // use the factorization to solve for the given right hand side

    Eigen::MatrixXd A_dI_dt = Eigen::MatrixXd::Zero(N, N);
    for (int i = 0; i < N - 1; ++i) {
        A_dI_dt(i, i) = -1.0 / dt;
        A_dI_dt(i, i + 1) = 1.0 / dt;
    }

    Eigen::VectorXd dI_dt = A_dI_dt * I;

    solved_V_L.resize(N);
    solved_s_times.resize(N);

    for (int i = 0; i < N; ++i) {
        solved_V_L[i] = L * dI_dt[i];
        solved_s_times[i] = solve_times[i] + dt;
    }
}

int main() {

    std::cout << Eigen::nbThreads() << std::endl;
    std::vector<double> dt_values = { 0.002, 0.001, 0.004 };

    auto start = std::chrono::high_resolution_clock::now(); // Start pomiaru czasu

    for (double dt : dt_values) {
        std::vector<double> times, V_L;
        solve(dt, times, V_L);
    }

    auto end = std::chrono::high_resolution_clock::now(); // Koniec pomiaru czasu

    std::cout << "Time difference = " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "[ms]" << std::endl;

    std::vector<double> exact_times, exact_V_L;
    double dt_exact = 0.001;
    int N_exact = static_cast<int>((t_end - t_start) / dt_exact);

    exact_times.resize(N_exact);
    exact_V_L.resize(N_exact);

    for (int i = 0; i < N_exact; ++i) {
        exact_times[i] = t_start + i * dt_exact;
        exact_V_L[i] = exact_voltage(exact_times[i]);
    }

    return 0;
}

Time difference = 31[ms]

you can continue experimenting with other solvers or use Intel-MKL to speed up your computation with different drawbacks.