Linear regression output is nonsensical

I have a dataset and am trying to fill in its missing values by using a 2D regression: I fit the surrounding curves and use the fitted slope to approximate the missing value. I am not sure whether this is the right approach, and I am open to other ideas. Here's my example:

# Example window: the column labels are the x values, the index labels are the
# y values, and the single np.nan (x=107.5, y=0.021917) is the cell to fill.
local_window = pd.DataFrame(
    {
        102.5: [
            0.0007808776581961896,
            0.0009108521507099643,
            0.001109650616093018,
            0.0013238862647034224,
            0.0018552410055933753,
        ],
        105.0: [
            0.0008955896980595855,
            0.001003244315807649,
            0.0011852612740301449,
            0.0013952857530607904,
            0.0018525880756980716,
        ],
        107.5: [
            np.nan,
            0.0012354997955153118,
            0.00140044893559622,
            0.0015902024099268574,
            0.001973254493672934,
        ],
    },
    index=[0.021917, 0.030136, 0.035616, 0.041095, 0.060273],
)

def predict_nan_local(local_window):
    """Fill the NaN cells of ``local_window`` using a least-squares plane fit.

    Every non-NaN cell is treated as one sample
    ``(column label, index label) -> value``. A ``LinearRegression`` is fitted
    on those samples and evaluated at the coordinates of the NaN cells.

    Args:
        local_window: DataFrame whose column labels and index labels are
            numeric; they serve as the two regressors.

    Returns:
        The same ``local_window`` object. It is modified in place: each NaN
        cell is overwritten with its prediction. When the frame contains no
        NaN it is returned untouched.
    """
    if not local_window.isnull().values.any():
        return local_window

    # Build one (x, y) coordinate pair per cell, in row-major order.
    x_labels = local_window.columns.values
    y_labels = local_window.index.values
    grid_x, grid_y = np.meshgrid(x_labels, y_labels)
    coords = np.column_stack((grid_x.ravel(), grid_y.ravel()))
    values = local_window.values.ravel()

    # Fit only on the cells that actually carry a value.
    known = ~np.isnan(values)
    regressor = LinearRegression()
    regressor.fit(coords[known], values[known])

    # (row, col) positions of the missing cells and their label coordinates.
    nan_indices = np.argwhere(np.isnan(local_window.values))
    X_nan = x_labels[nan_indices[:, 1]]
    y_nan = y_labels[nan_indices[:, 0]]
    predicted_values = regressor.predict(np.column_stack((X_nan, y_nan)))

    # Write cell by cell. ``.iloc[rows, cols] = ...`` with two index lists
    # addresses the outer product of rows x cols, so with several NaNs it
    # would overwrite valid cells instead of pairing each prediction with its
    # own (row, col) position.
    for (row, col), value in zip(nan_indices, predicted_values):
        local_window.iat[row, col] = value

    return local_window

The output - as you can see - doesn't make a whole lot of sense. Is there anything I am missing?

Solution

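I made some modifications to your predict function. Instead of fitting a plane directly on the two coordinates, it first expands them with `PolynomialFeatures` (degree 2 by default) and then fits the linear regression on those polynomial terms: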

def predict_nan_local(local_window, degree=2):
    """Fill the NaN cells of ``local_window`` from a polynomial surface fit.

    Every non-NaN cell is treated as one sample
    ``(column label, index label) -> value``. The two coordinates are expanded
    with ``PolynomialFeatures(degree=degree)``, a ``LinearRegression`` is
    fitted on the expanded samples, and the fitted surface is evaluated at the
    NaN cells.

    Args:
        local_window: DataFrame whose column labels and index labels are
            numeric; they serve as the two regressors.
        degree: Degree passed to ``PolynomialFeatures``.

    Returns:
        ``local_window`` itself when it contains no NaN; otherwise a new
        DataFrame with the same index and columns in which only the NaN cells
        have been replaced by predictions. The input frame is not modified.
    """
    # Only proceed if there are NaNs to fill
    if not local_window.isnull().values.any():
        return local_window

    # One (x, y) coordinate pair per cell, in row-major order.
    x = local_window.columns.values
    y = local_window.index.values
    X, Y = np.meshgrid(x, y)
    XY_flat = np.column_stack((X.ravel(), Y.ravel()))

    # flatten() always copies. ravel() on ``.values`` may hand back a view of
    # the frame's own buffer, so writing into it would either mutate the
    # caller's data or fail on a read-only array.
    Z_flat = local_window.to_numpy().flatten()

    valid_mask = ~np.isnan(Z_flat)
    missing_mask = ~valid_mask

    # Fit on the known cells only.
    poly = PolynomialFeatures(degree=degree)
    XY_poly = poly.fit_transform(XY_flat[valid_mask])
    regressor = LinearRegression()
    regressor.fit(XY_poly, Z_flat[valid_mask])

    # Evaluate the surface only where a value is actually missing.
    Z_flat[missing_mask] = regressor.predict(poly.transform(XY_flat[missing_mask]))

    # Reuse the original Index objects so their names and dtypes carry over.
    filled_local_window = pd.DataFrame(
        Z_flat.reshape(local_window.shape),
        index=local_window.index,
        columns=local_window.columns,
    )

    return filled_local_window

Running this on your data:

# Fill the NaN in the example frame (degree defaults to 2); the returned
# frame is the table shown below.
predict_nan_local(local_window)

               102.5       105.0       107.5
0.021917    0.000781    0.000896    0.001089
0.030136    0.000911    0.001003    0.001235
0.035616    0.001110    0.001185    0.001400
0.041095    0.001324    0.001395    0.001590
0.060273    0.001855    0.001853    0.001973

So we have imputed 0.001089 for the missing value. Plotting this:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Assuming local_window is your DataFrame with the same structure as in your example.
# The previously missing cell (x=107.5, y=0.021917) already holds the imputed
# value 0.001089 here.
local_window = pd.DataFrame({
    102.5: {0.021917: 0.0007808776581961896,
            0.030136: 0.0009108521507099643,
            0.035616: 0.001109650616093018,
            0.041095: 0.0013238862647034224,
            0.060273: 0.0018552410055933753},
    105.0: {0.021917: 0.0008955896980595855,
            0.030136: 0.001003244315807649,
            0.035616: 0.0011852612740301449,
            0.041095: 0.0013952857530607904,
            0.060273: 0.0018525880756980716},
    107.5: {0.021917: 0.001089,
            0.030136: 0.0012354997955153118,
            0.035616: 0.00140044893559622,
            0.041095: 0.0015902024099268574,
            0.060273: 0.001973254493672934}
})

# (x, y) coordinates of the imputed cell(s). The frame above no longer contains
# any NaN, so the points to highlight cannot be found with isnull() and have to
# be listed explicitly.
imputed_points = [(107.5, 0.021917)]

# Transpose the DataFrame so that each y value becomes one line plotted over
# the x values (102.5, 105.0, 107.5)
local_window_transposed = local_window.T

# Create the plot
plt.figure(figsize=(10, 5))

# Plot each column of the transposed frame (one y value) as a separate line
for column in local_window_transposed.columns:
    plt.plot(local_window_transposed.index, local_window_transposed[column], marker='o', label=f'y={column}')

# Circle the imputed point(s); rows of local_window are y, columns are x
for x_val, y_val in imputed_points:
    plt.scatter(x_val, local_window.loc[y_val, x_val], s=100,
                facecolors='none', edgecolors='r', zorder=3)

# Add titles and labels
plt.title('Chart Title')
plt.xlabel('X-axis Label')
plt.ylabel('Y-axis Label')
plt.legend(title='Legend Title')

# Show the plot
plt.show()

Is this closer to what you expected?