I have a dataset with missing values, and I am trying to fill them in by fitting a 2D regression over the surrounding points, so that the slope of the neighbouring curves approximates each missing value. I am not sure whether this is the right approach and am open to other ideas. Here is my example:
# Sample window: columns are the x labels (102.5, 105.0, 107.5), rows are the
# y labels listed in ``_row_keys``. The first cell of the 107.5 column is the
# missing value to be imputed.
_row_keys = [0.021917, 0.030136, 0.035616, 0.041095, 0.060273]
local_window = pd.DataFrame(
    {
        102.5: dict(zip(_row_keys, [
            0.0007808776581961896,
            0.0009108521507099643,
            0.001109650616093018,
            0.0013238862647034224,
            0.0018552410055933753,
        ])),
        105.0: dict(zip(_row_keys, [
            0.0008955896980595855,
            0.001003244315807649,
            0.0011852612740301449,
            0.0013952857530607904,
            0.0018525880756980716,
        ])),
        107.5: dict(zip(_row_keys, [
            np.nan,
            0.0012354997955153118,
            0.00140044893559622,
            0.0015902024099268574,
            0.001973254493672934,
        ])),
    }
)
def predict_nan_local(local_window):
    """Fill the NaN cells of ``local_window`` in place with a fitted plane.

    A linear model ``value ~ a * column_label + b * index_label + c`` is
    fitted on every non-NaN cell and then evaluated at the NaN cells.

    Parameters
    ----------
    local_window : pandas.DataFrame
        Numeric frame whose column and index labels are numeric coordinates.

    Returns
    -------
    pandas.DataFrame
        The same ``local_window`` object; NaN cells are overwritten with the
        model's predictions. Returned untouched when nothing is missing.
    """
    if not local_window.isnull().values.any():
        return local_window

    col_labels = local_window.columns.values
    row_labels = local_window.index.values

    # Coordinate grids shaped like the frame: grid_cols[i, j] is the column
    # label and grid_rows[i, j] the index label of cell (i, j).
    grid_cols, grid_rows = np.meshgrid(col_labels, row_labels)
    cell_values = local_window.values
    known = ~np.isnan(cell_values)

    # Fit on the observed cells only.
    regressor = LinearRegression()
    regressor.fit(
        np.column_stack((grid_cols[known], grid_rows[known])),
        cell_values[known],
    )

    # Positional (row, column) pairs of the missing cells, in row-major order.
    missing_rows, missing_cols = np.nonzero(~known)
    predicted_values = regressor.predict(
        np.column_stack((col_labels[missing_cols], row_labels[missing_rows]))
    )

    # Write cell by cell: ``.iloc[rows, cols]`` with two lists selects the
    # cross product of rows and columns, which would overwrite valid cells (or
    # fail on shape) as soon as NaNs occur in more than one row and column.
    for row_pos, col_pos, value in zip(missing_rows, missing_cols, predicted_values):
        local_window.iat[row_pos, col_pos] = value
    return local_window
The output - as you can see - doesn't make a whole lot of sense. Is there anything I am missing?
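I made some modifications to your predict function. The main change is that it fits a polynomial surface (degree 2 by default) rather than a flat plane: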
def predict_nan_local(local_window, degree=2):
    """Return a copy of ``local_window`` with NaN cells filled by a polynomial fit.

    A polynomial surface of the given ``degree`` in (column label, index
    label) is fitted by least squares on the non-NaN cells and evaluated at
    the NaN cells.

    Parameters
    ----------
    local_window : pandas.DataFrame
        Numeric frame whose column and index labels are numeric coordinates.
    degree : int, default 2
        Degree passed to ``PolynomialFeatures``.

    Returns
    -------
    pandas.DataFrame
        ``local_window`` itself when it has no NaN; otherwise a new frame
        with the same labels and the NaN cells replaced by predictions.
    """
    # Only proceed if there are NaNs to fill
    if not local_window.isnull().values.any():
        return local_window

    # Coordinate grids shaped like the frame
    x = local_window.columns.values
    y = local_window.index.values
    X, Y = np.meshgrid(x, y)

    # Flatten the grid for fitting
    X_flat = X.ravel()
    Y_flat = Y.ravel()
    # Take an explicit copy: ``.values.ravel()`` can be a view of the frame's
    # own buffer, in which case writing predictions into it would silently
    # modify the caller's frame (or fail on a read-only array).
    Z_flat = local_window.to_numpy(dtype=float, copy=True).ravel()

    # Split cells into observed and missing
    valid_mask = ~np.isnan(Z_flat)
    missing_mask = ~valid_mask

    # Create polynomial features from the observed coordinates
    poly = PolynomialFeatures(degree=degree)
    XY_poly = poly.fit_transform(
        np.column_stack((X_flat[valid_mask], Y_flat[valid_mask]))
    )

    # Fit the model
    regressor = LinearRegression()
    regressor.fit(XY_poly, Z_flat[valid_mask])

    # Predict only where values are missing and fill them in
    XY_missing_poly = poly.transform(
        np.column_stack((X_flat[missing_mask], Y_flat[missing_mask]))
    )
    Z_flat[missing_mask] = regressor.predict(XY_missing_poly)

    filled_local_window = pd.DataFrame(
        Z_flat.reshape(local_window.shape), index=y, columns=x
    )
    return filled_local_window
Running this on your data:
predict_nan_local(local_window)
102.5 105.0 107.5
0.021917 0.000781 0.000896 0.001089
0.030136 0.000911 0.001003 0.001235
0.035616 0.001110 0.001185 0.001400
0.041095 0.001324 0.001395 0.001590
0.060273 0.001855 0.001853 0.001973
So we have imputed 0.001089 for the missing value. Plotting this:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Assuming local_window is your DataFrame with the same structure as in your example
# Example window with the previously missing cell (row 0.021917, column 107.5)
# already filled with the imputed value 0.001089.
local_window = pd.DataFrame({
    102.5: {0.021917: 0.0007808776581961896,
            0.030136: 0.0009108521507099643,
            0.035616: 0.001109650616093018,
            0.041095: 0.0013238862647034224,
            0.060273: 0.0018552410055933753},
    105.0: {0.021917: 0.0008955896980595855,
            0.030136: 0.001003244315807649,
            0.035616: 0.0011852612740301449,
            0.041095: 0.0013952857530607904,
            0.060273: 0.0018525880756980716},
    107.5: {0.021917: 0.001089,
            0.030136: 0.0012354997955153118,
            0.035616: 0.00140044893559622,
            0.041095: 0.0015902024099268574,
            0.060273: 0.001973254493672934},
})

# (row label, column label) of the cells that were imputed. The frame above no
# longer contains NaN, so the imputed cells cannot be rediscovered with
# isnull(); they are listed explicitly so they can be highlighted.
imputed_cells = [(0.021917, 107.5)]

# Transpose so that each original row becomes one line drawn across the
# x-values (102.5, 105.0, 107.5)
local_window_transposed = local_window.T

# Create the plot
plt.figure(figsize=(10, 5))

# Plot each original row as a separate line
for column in local_window_transposed.columns:
    plt.plot(local_window_transposed.index, local_window_transposed[column], marker='o', label=f'y={column}')

# Circle the imputed point(s) in red
for row_label, col_label in imputed_cells:
    plt.scatter(col_label, local_window.at[row_label, col_label], s=100, facecolors='none', edgecolors='r')

# Add titles and labels
plt.title('Chart Title')
plt.xlabel('X-axis Label')
plt.ylabel('Y-axis Label')
plt.legend(title='Legend Title')

# Show the plot
plt.show()
Is this closer to what you expected?