Tags: python, numpy, parallel-processing, numba, jit

Unexpected results with parallel numba jit


I'm trying to parallelize this numba-jitted function. I initially thought this would be trivial, since it is an embarrassingly parallel problem, but the sequential and parallel implementations produce different outputs. Any idea what could be happening here, and whether there is a way to make this work?

See reproducible example below:

import numba
import numpy as np
from sklearn.datasets import make_regression


@numba.jit(nopython=True)
def nanlstsq(X, y):
    """Return the least-squares solution to a linear matrix equation

    Analogous to ``numpy.linalg.lstsq`` for a dependent variable containing ``NaN``

    Args:
        X ((M, N) np.ndarray): Matrix of independent variables
        y ({(M,), (M, K)} np.ndarray): Matrix of dependent variables

    Returns:
        np.ndarray: Least-squares solution, ignoring ``NaN``
    """
    beta = np.zeros((X.shape[1], y.shape[1]), dtype=np.float64)
    for idx in range(y.shape[1]):
        # subset y and X
        isna = np.isnan(y[:,idx])
        X_sub = X[~isna]
        y_sub = y[~isna,idx]
        # Compute beta on data subset
        XTX = np.linalg.inv(np.dot(X_sub.T, X_sub))
        XTY = np.dot(X_sub.T, y_sub)
        beta[:,idx] = np.dot(XTX, XTY)
    return beta


@numba.jit(nopython=True, parallel=True)
def nanlstsq_parallel(X, y):
    beta = np.zeros((X.shape[1], y.shape[1]), dtype=np.float64)
    for idx in numba.prange(y.shape[1]):
        # subset y and X
        isna = np.isnan(y[:,idx])
        X_sub = X[~isna]
        y_sub = y[~isna,idx]
        # Compute beta on data subset
        XTX = np.linalg.inv(np.dot(X_sub.T, X_sub))
        XTY = np.dot(X_sub.T, y_sub)
        beta[:,idx] = np.dot(XTX, XTY)
    return beta




# Generate random data
n_targets = 10000
n_features = 3
X, y = make_regression(n_samples=200, n_features=n_features,
                       n_targets=n_targets)
# Set randomly chosen entries of y to NaN
y.ravel()[np.random.choice(y.size, 5*n_targets, replace=False)] = np.nan
# Run the regression
beta = nanlstsq(X, y)
beta_parallel = nanlstsq_parallel(X, y)


np.testing.assert_allclose(beta, beta_parallel)
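
For what it's worth, replacing the assertion with an explicit comparison quantifies the discrepancy (illustrative only; the number and magnitude of mismatching columns vary from run to run):

# Quantify the mismatch instead of asserting (illustrative; values vary per run)
diff = np.abs(beta - beta_parallel)
print("max abs difference:", diff.max())
print("columns that differ:", np.sum(diff.max(axis=0) > 1e-8))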

Solution

  • Not sure what is causing the difference between the sequential and parallel versions, but computing the inverse of X_sub.T @ X_sub is not a recommended way to solve matrix equations ("don't invert that matrix"). If you still want to form the normal equations, use numpy.linalg.solve; otherwise, skip the normal equations entirely and use numpy.linalg.lstsq. Both are supported by numba and pass your assert_allclose test against the sequential version:

    @numba.jit(nopython=True, parallel=True)
    def nanlstsq_parallel_solve(X, y):
        beta = np.zeros((X.shape[1], y.shape[1]), dtype=np.float64)
        for idx in numba.prange(y.shape[1]):
            # subset y and X
            isna = np.isnan(y[:,idx])
            X_sub = X[~isna]
            y_sub = y[~isna,idx]
            # Solve the normal equations (X_sub.T @ X_sub) beta = X_sub.T @ y_sub without inverting
            beta[:, idx] = np.linalg.solve(np.dot(X_sub.T, X_sub), np.dot(X_sub.T, y_sub))
        return beta
    
    
    @numba.jit(nopython=True, parallel=True)
    def nanlstsq_parallel_lstsq(X, y):
        beta = np.zeros((X.shape[1], y.shape[1]), dtype=np.float64)
        for idx in numba.prange(y.shape[1]):
            # subset y and X
            isna = np.isnan(y[:,idx])
            X_sub = X[~isna]
            y_sub = y[~isna,idx]
            # Least-squares fit on X_sub directly; element [0] of the result is the coefficient vector
            beta[:, idx] = np.linalg.lstsq(X_sub, y_sub)[0]
        return beta
    
    
    beta_parallel_solve = nanlstsq_parallel_solve(X, y)
    beta_parallel_lstsq = nanlstsq_parallel_lstsq(X, y)
    
    
    np.testing.assert_allclose(beta, beta_parallel_solve)
    np.testing.assert_allclose(beta, beta_parallel_lstsq)
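
  • As a quick sanity check (a minimal sketch, not part of the fix above), numba can report how the prange loop was parallelized, and a rough timing comparison shows whether the parallel version actually helps; results depend on hardware and on n_targets:

    import time

    # Print numba's parallel transformation report (available once the function has been compiled)
    nanlstsq_parallel_lstsq.parallel_diagnostics(level=1)

    # Rough wall-clock comparison; numbers will vary with hardware and problem size
    t0 = time.perf_counter()
    nanlstsq(X, y)
    t1 = time.perf_counter()
    nanlstsq_parallel_lstsq(X, y)
    t2 = time.perf_counter()
    print("sequential: %.3f s  parallel (lstsq): %.3f s" % (t1 - t0, t2 - t1))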