Tags: python, performance, time-complexity, space-complexity

Improve the computational time and memory usage of calculating a large matrix with four nested loops


I want to calculate a matrix G whose elements are scalars, computed as:

G[i, j] = sum_{l, l'} M[l] * M[l'] * k(r[l] + gamma * z[i], r[l'] + gamma * z[j])

where k is a Matern kernel, r[l] and z[i] are the rows of the n x d matrices R and Z, and M is a length-n weight vector.

I want to calculate this matrix for large n > 10000 and d > 30. My code (below) has huge overhead and still takes a very long time.

How can I make this computation as fast as possible, without using a GPU and while minimizing memory usage?
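
For reference, a direct four-loop translation of the formula would look like the sketch below (only feasible for tiny n, but useful as ground truth for any optimized version; the name compute_G_naive is just illustrative):

import numpy as np
from sklearn.gaussian_process.kernels import Matern

def compute_G_naive(M, gamma, R, Z, nu=1.5, length_scale=1.0):
    """O(n^4) reference implementation: a literal translation of the formula."""
    kernel = Matern(length_scale=length_scale, nu=nu)
    n = R.shape[0]
    M = np.asarray(M).ravel()
    G = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            total = 0.0
            for l in range(n):
                for lp in range(n):
                    x = (R[l] + gamma * Z[i]).reshape(1, -1)
                    y = (R[lp] + gamma * Z[j]).reshape(1, -1)
                    total += M[l] * M[lp] * kernel(x, y)[0, 0]
            G[i, j] = total
    return G

Here is my current attempt: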

import numpy as np
from sklearn.gaussian_process.kernels import Matern
from tqdm import tqdm
from joblib import Parallel, delayed

# Pre-flattened computation to minimize data transfer overhead
def precompute_differences(R, Z):
    n, d        = R.shape
    R_diff_flat = (R[:, None, :] - R[None, :, :]).reshape(n * n, d)
    Z_diff      = Z[:, None, :] - Z[None, :, :]
    return R_diff_flat, Z_diff

def compute_G_row(i, R_diff_flat, Z_diff, W, gamma_val, kernel, n, d):
    """
    Compute the i-th row for j >= i and store them in a temporary array.
    """
    row_values = np.zeros(n)
    for j in range(i, n):
        Z_ij   = gamma_val * Z_diff[i, j].reshape(1, d)
        K_flat = kernel(R_diff_flat, Z_ij)
        K_ij   = K_flat.reshape(n, n)
        row_values[j] = np.sum(W * K_ij)
    return i, row_values

def compute_G(M, gamma, R, Z, nu=1.5, length_scale=1.0, use_parallel=True):
    """
    Compute the G matrix with fewer kernel evaluations by exploiting symmetry:
    G[i,j] = G[j,i]. We only compute for j >= i, then mirror the result.
    """
    R = np.asarray(R)
    Z = np.asarray(Z)
    M = np.asarray(M).reshape(-1, 1)  # ensure (n,1)
    n, d = R.shape

    # Precompute data
    R_diff_flat, Z_diff = precompute_differences(R, Z)
    W = M @ M.T  # Weight matrix

    G = np.zeros((n, n))

    kernel = Matern(length_scale=length_scale, nu=nu)

    if use_parallel and n > 1:
        # Parallel computation
        results = Parallel(n_jobs=-1)(
            delayed(compute_G_row)(i, R_diff_flat, Z_diff, W, gamma, kernel, n, d)
            for i in tqdm(range(n), desc="Computing G matrix")
        )
    else:
        # Single-threaded computation, reusing the same row routine
        results = [
            compute_G_row(i, R_diff_flat, Z_diff, W, gamma, kernel, n, d)
            for i in tqdm(range(n), desc="Computing G matrix")
        ]

    # Sort and fill final G by symmetry
    results.sort(key=lambda x: x[0])
    for i, row_vals in results:
        for j in range(i, n):
            G[i, j] = row_vals[j]
            G[j, i] = row_vals[j]  # mirror for symmetry

    # Delete auxiliary variables to save memory
    del R_diff_flat, Z_diff, W, kernel, results

    # Optional checks
    is_symmetric = np.allclose(G, G.T, atol=1e-8)
    eigenvalues = np.linalg.eigvalsh(G)
    is_semi_positive_definite = np.all(eigenvalues >= -1e-8)
    print(f"G is semi-positive definite: {is_semi_positive_definite}")
    print(f"G is symmetric: {is_symmetric}")

    # Note: deleting names from locals() inside a function is a no-op in
    # CPython, so the explicit `del` of the large intermediates above is
    # what actually releases memory here.

    return G

Toy Example

# Example usage:
if __name__ == "__main__":
    __spec__ = None  # workaround for joblib/multiprocessing when running in some IDEs
    n = 20
    d = 10
    gamma = 0.9
    R = np.random.rand(n, d)
    Z = np.random.rand(n, d)
    M = np.random.rand(n, 1)

    G = compute_G(M, gamma, R, Z, nu=1.5, length_scale=1.0, use_parallel=True)
    print("G computed with shape:", G.shape)

Solution

  • UPDATE: It turned out that the answer I originally wrote was still not feasible. Leveraging the idea from @Onyymbu, here is the updated code, which seems to work well with small memory usage. The key observation is that the Matern kernel is stationary (it depends only on the difference of its two arguments), so k(r[l] + gamma*z[i], r[l'] + gamma*z[j]) depends only on (r[l] - r[l']) + gamma*(z[i] - z[j]), and the whole computation can be driven by the n(n-1)/2 unique pairwise differences of R and of Z.

    import numpy as np
    from sklearn.gaussian_process.kernels import Matern
    from tqdm import tqdm
    from joblib import Parallel, delayed
    from scipy.spatial.distance import squareform
    
    def G_Batchwise_optimized(M, gamma, R, Z, nu=1.5, length_scale=1.0, batch_size=100):
        """
        Optimized computation of G[i,j] = sum_{l,l'} M[l]*M[l'] * k(r[l] + gamma*z[i], r[l'] + gamma*z[j]).

        Reduces memory overhead by computing the kernel over pairwise differences in batches.
        """
        Z = np.array(Z)
        R = np.array(R)
        kernel = Matern(length_scale=length_scale, nu=nu)
        n = R.shape[0]
        mm = np.asarray(M).ravel()
    
        ### Generate the unique i < j pairs in row-major upper-triangle order ###
        ### (the order squareform expects; equivalent to np.triu_indices(n, 1)) ###
        a = np.arange(n - 1)
        b = np.arange(n - 1, 0, -1)
        i = np.repeat(a, b)
        j = np.concatenate([np.arange(k, n) for k in a + 1])
    
        m = mm[i] * mm[j]  # Weight products
    
        ### Contributions where one of the two differences vanishes ###
        zero = np.zeros((1, R.shape[1]))  # placeholder for a zero difference
        diagR = kernel(gamma * (Z[i] - Z[j]), zero).ravel()  # l == l' terms (R difference is zero)
        diagZ = kernel(zero, R[i] - R[j]).ravel()            # i == j terms (Z difference is zero)
        diagM = mm @ mm  # sum_l M[l]^2
    
        ### All unique pairwise R differences, computed once (num_pairs x d) ###
        RD = R[i] - R[j]

        ### Function to process one batch of output pairs ###
        def process_batch(start_idx, end_idx):
            """Computes the weighted kernel sum for a batch of output pairs (i, j)."""
            ZD_batch = gamma * (Z[i[start_idx:end_idx]] - Z[j[start_idx:end_idx]])

            ### Kernel between every R difference and the batch of Z differences, ###
            ### evaluated in both orientations (l, l') and (l', l) ###
            kernel_vals = kernel(RD, ZD_batch) + kernel(RD, -ZD_batch)

            ### Weighted sum over all unique (l, l') pairs for each output pair ###
            return m @ kernel_vals
    
        ### Compute off-diagonal elements in batches to bound peak memory ###
        ### (each batch materializes a (num_pairs, batch_size) kernel block) ###
        num_pairs = len(i)
        results = Parallel(n_jobs=-1)(
            delayed(process_batch)(start, min(start + batch_size, num_pairs))
            for start in tqdm(range(0, num_pairs, batch_size), desc="Computing G matrix")
        )
    
        ### Construct the G matrix ###
        G = squareform(np.concatenate(results) + diagR * diagM)
    
        ### Fill diagonal elements ###
        G[np.diag_indices_from(G)] = 2 * diagZ @ m + diagM
    
        ### Optional validation checks. ###
        is_symmetric = np.allclose(G, G.T, atol=1e-8)
        eigenvalues = np.linalg.eigvalsh(G)
        is_semi_positive_definite = np.all(eigenvalues >= -1e-8)
        print(f"G is semi-positive definite: {is_semi_positive_definite}")
        print(f"G is symmetric: {is_symmetric}")
    
        return G
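
    The line kernel(RD, ZD_batch) + kernel(RD, -ZD_batch) works because the Matern kernel is stationary: k(x, y) depends only on x - y, so evaluating against +ZD and -ZD picks up the two orientations (l, l') and (l', l) of each unique pair. A quick numerical check of the underlying identity (a standalone sketch):

    import numpy as np
    from sklearn.gaussian_process.kernels import Matern

    k = Matern(length_scale=1.0, nu=1.5)
    rng = np.random.default_rng(0)
    r_l, r_lp, z_i, z_j = rng.random((4, 5))
    gamma = 0.9
    lhs = k((r_l + gamma * z_i)[None], (r_lp + gamma * z_j)[None])[0, 0]
    rhs = k((r_l - r_lp)[None], (-gamma * (z_i - z_j))[None])[0, 0]
    print(np.isclose(lhs, rhs))  # True: only the difference of the inputs matters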
    

    and a toy example:

    #%%
    # Example usage:
    if __name__ == "__main__":
        n = 500  # Large-scale test
        d = 10
        gamma = 0.9
        R = np.random.rand(n, d)
        Z = np.random.rand(n, d)
        M = np.random.rand(n)
    
        G1_optimized = G_Batchwise_optimized(M, gamma, R, Z, nu=1.5, length_scale=1.0, batch_size=1000)
        print(f"G computed with shape: {G1_optimized.shape}, Memory usage: {G1_optimized.nbytes / 1e6:.1f} MB")