Search code examples
pythonregression

Linear regression output is nonsensical


I have a dataset and am trying to fill in its missing values by fitting a 2D regression to the surrounding points, so that the slope of the neighbouring curves can be used to approximate each missing value. I am not sure this is the right approach and am open to other ideas. Here is my example:

# Sample 5x3 window. Each key below is a column label and each list holds
# that column's values in the order of the row labels passed as ``index``.
# The single NaN (row 0.021917, column 107.5) is the value to be imputed.
local_window = pd.DataFrame(
    {
        102.5: [
            0.0007808776581961896,
            0.0009108521507099643,
            0.001109650616093018,
            0.0013238862647034224,
            0.0018552410055933753,
        ],
        105.0: [
            0.0008955896980595855,
            0.001003244315807649,
            0.0011852612740301449,
            0.0013952857530607904,
            0.0018525880756980716,
        ],
        107.5: [
            np.nan,
            0.0012354997955153118,
            0.00140044893559622,
            0.0015902024099268574,
            0.001973254493672934,
        ],
    },
    index=[0.021917, 0.030136, 0.035616, 0.041095, 0.060273],
)
def predict_nan_local(local_window):
    """Fill the NaN cells of ``local_window`` in place from a fitted plane.

    An ordinary least-squares model with two features (the cell's column
    label and the cell's index label) is fitted to every non-NaN cell and
    then evaluated at every NaN cell. The model is linear in both labels.

    Args:
        local_window: DataFrame whose index labels, column labels and values
            are all numeric; NaN marks the cells to fill.

    Returns:
        The same DataFrame object that was passed in. It is returned
        untouched when it contains no NaN; otherwise its NaN cells have
        been overwritten with the model's predictions.
    """
    if not local_window.isnull().values.any():
        return local_window

    values = local_window.values

    # Label grids shaped like the window: entry (i, j) holds the column
    # label / index label of cell (i, j).
    col_grid, row_grid = np.meshgrid(
        local_window.columns.values, local_window.index.values
    )

    known = ~np.isnan(values)
    missing = ~known

    # Boolean-mask indexing walks the grid in row-major order, so features
    # and targets stay aligned cell by cell.
    regressor = LinearRegression()
    regressor.fit(
        np.column_stack((col_grid[known], row_grid[known])), values[known]
    )

    # argwhere is row-major as well, so positions line up with predictions.
    missing_positions = np.argwhere(missing)
    predicted_values = regressor.predict(
        np.column_stack((col_grid[missing], row_grid[missing]))
    )

    # Write each prediction to its own cell. Indexing ``.iloc`` with a list
    # of rows and a list of columns selects their cross product, which
    # would overwrite observed cells as soon as two NaNs sit in different
    # rows and columns.
    for (row_pos, col_pos), predicted in zip(missing_positions, predicted_values):
        local_window.iat[row_pos, col_pos] = predicted

    return local_window

enter image description here

The output - as you can see - doesn't make a whole lot of sense. Is there anything I am missing?


Solution

  • I made some modifications to your predict function:

    def predict_nan_local(local_window, degree=2):
        """Return a copy of ``local_window`` with NaNs filled by a polynomial fit.

        A polynomial surface of the given degree in (column label, index
        label) is fitted by least squares to the non-NaN cells and evaluated
        at the NaN cells.

        Args:
            local_window: DataFrame whose index labels, column labels and
                values are all numeric; NaN marks the cells to fill.
            degree: Degree passed to ``PolynomialFeatures`` (default 2).

        Returns:
            ``local_window`` itself when it contains no NaN; otherwise a new
            DataFrame with the same shape, index and columns in which only
            the NaN cells have been replaced. The input is not modified.
        """
        # Only proceed if there are NaNs to fill
        if not local_window.isnull().values.any():
            return local_window

        # Label grids shaped like the window: X holds column labels, Y holds
        # index labels for every cell.
        x = local_window.columns.values
        y = local_window.index.values
        X, Y = np.meshgrid(x, y)

        # Flatten the grid for fitting
        X_flat = X.ravel()
        Y_flat = Y.ravel()
        # Take an explicit copy: ``.values.ravel()`` may be a view of the
        # frame's own buffer, so writing into it could mutate the caller's
        # data (or fail on a read-only array under copy-on-write).
        Z_flat = local_window.to_numpy(dtype=float, copy=True).ravel()

        # Split cells into observed and missing
        valid_mask = ~np.isnan(Z_flat)
        missing_mask = ~valid_mask

        # Create polynomial features from the observed cells
        poly = PolynomialFeatures(degree=degree)
        XY_poly = poly.fit_transform(
            np.column_stack((X_flat[valid_mask], Y_flat[valid_mask]))
        )

        # Fit the model
        regressor = LinearRegression()
        regressor.fit(XY_poly, Z_flat[valid_mask])

        # Predict only where values are missing
        XY_missing_poly = poly.transform(
            np.column_stack((X_flat[missing_mask], Y_flat[missing_mask]))
        )
        Z_flat[missing_mask] = regressor.predict(XY_missing_poly)

        # Rebuild the frame, reusing the original index/column objects so
        # their names and dtypes carry over.
        filled_local_window = pd.DataFrame(
            Z_flat.reshape(local_window.shape),
            index=local_window.index,
            columns=local_window.columns,
        )

        return filled_local_window
    

    Running this on your data:

    predict_nan_local(local_window)
    
                   102.5       105.0       107.5
    0.021917    0.000781    0.000896    0.001089
    0.030136    0.000911    0.001003    0.001235
    0.035616    0.001110    0.001185    0.001400
    0.041095    0.001324    0.001395    0.001590
    0.060273    0.001855    0.001853    0.001973
    

    So we have imputed 0.001089 for the missing value. Plotting this:

    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    
    # Window with the previously missing cell (row 0.021917, column 107.5)
    # already replaced by the imputed value 0.001089.
    local_window = pd.DataFrame({
        102.5: {0.021917: 0.0007808776581961896,
                0.030136: 0.0009108521507099643,
                0.035616: 0.001109650616093018,
                0.041095: 0.0013238862647034224,
                0.060273: 0.0018552410055933753},
        105.0: {0.021917: 0.0008955896980595855,
                0.030136: 0.001003244315807649,
                0.035616: 0.0011852612740301449,
                0.041095: 0.0013952857530607904,
                0.060273: 0.0018525880756980716},
        107.5: {0.021917: 0.001089,
                0.030136: 0.0012354997955153118,
                0.035616: 0.00140044893559622,
                0.041095: 0.0015902024099268574,
                0.060273: 0.001973254493672934}
    })

    # Cells that were imputed rather than observed, as (row label, column label).
    # They have to be listed explicitly: the frame above no longer contains
    # any NaN, so the filled cells cannot be rediscovered with isnull().
    imputed_cells = [(0.021917, 107.5)]

    # Transpose so the original columns (102.5, 105.0, 107.5) become the
    # x-axis and each original row becomes one plotted line.
    local_window_transposed = local_window.T

    # Create the plot
    plt.figure(figsize=(10, 5))

    # Plot each original row as a separate line
    for column in local_window_transposed.columns:
        plt.plot(local_window_transposed.index, local_window_transposed[column], marker='o', label=f'y={column}')

    # Circle the imputed point(s) in red; zorder keeps the ring above the line markers
    for row_label, col_label in imputed_cells:
        plt.scatter(col_label, local_window.at[row_label, col_label],
                    s=100, facecolors='none', edgecolors='r', zorder=3)

    # Add titles and labels
    plt.title('Chart Title')
    plt.xlabel('X-axis Label')
    plt.ylabel('Y-axis Label')
    plt.legend(title='Legend Title')

    # Show the plot
    plt.show()
    

    enter image description here

    Is this closer to what you expected?