I am experimenting with using a PyTorch model inside a scikit-learn pipeline, where the pipeline handles all of the preprocessing.
I am able to make it run. However, the results I am getting are a bit off: the loss does not seem to decrease and gets stuck (presumably in a local optimum?) as the training loop progresses.
I follow the standard PyTorch training loop and wrap it inside the fit method as this is what sklearn wants:
import torch
from sklearn.base import BaseEstimator, TransformerMixin
import torch.nn.functional as F
from IPython.core.debugger import set_trace
# +
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
import random
# -
# Fetch seaborn's "tips" example dataset and preview its first five rows.
df = sns.load_dataset("tips")
df.head(n=5)
# +
class LinearRegressionModel(torch.nn.Module, BaseEstimator, TransformerMixin):
    """Single linear layer trained with SGD, usable as the last step of an sklearn Pipeline.

    Parameters
    ----------
    loss_func : callable
        Loss applied to ``(prediction, target)``; defaults to mean squared error.
    in_features : int
        Number of columns in the design matrix passed to ``fit``. The default
        of 3 matches the original hard-coded layer width (presumably
        intercept + tip + size for the "tip + size" formula -- confirm against
        the transformer that feeds this model).
    """

    def __init__(self, loss_func=torch.nn.MSELoss(), in_features=3):
        super().__init__()
        # Stored under the same name as the constructor argument so that
        # BaseEstimator.get_params() can read it back.
        self.in_features = in_features
        self.linear = torch.nn.Linear(in_features, 1)  # in_features in, one out
        self.loss_func = loss_func
        self.optimizer = torch.optim.SGD(self.parameters(), lr=0.01)

    def forward(self, x):
        """Return the ReLU-activated output of the linear layer for ``x``."""
        # NOTE(review): the ReLU clamps predictions at zero and passes no
        # gradient for rows whose linear output is negative; it is kept here
        # only to preserve the existing forward() behaviour.
        return F.relu(self.linear(x))

    def fit(self, X, y):
        """Train the linear layer on ``X`` against ``y`` for 12 full-batch SGD steps.

        Parameters
        ----------
        X : array-like
            Design matrix with ``in_features`` columns.
        y : array-like
            One target value per row of ``X`` (pandas Series or array).

        Returns
        -------
        self
            The fitted estimator, following the sklearn convention.
        """
        # np.array copies, like the original astype(), so the tensors never
        # alias caller-owned memory.
        features = torch.from_numpy(np.array(X, dtype=np.float32))
        # Reshape the target to a column so it lines up with the (n, 1)
        # prediction; a 1-D target would broadcast to (n, n) inside the loss.
        targets = torch.from_numpy(np.array(y, dtype=np.float32)).reshape(-1, 1)

        for epoch in tqdm(range(12)):
            pred_y = self.forward(features)

            # Compare the prediction with the true target (the original code
            # compared it with the design matrix, so the loss never improved).
            loss = self.loss_func(pred_y, targets)

            # Zero gradients, perform a backward pass, and update the weights.
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            print('epoch {}, loss {}'.format(epoch, loss.item()))

        return self
# +
from sklearn.pipeline import Pipeline
from sklego.preprocessing import PatsyTransformer
# -
# Two-stage pipeline: the patsy formula builds the design matrix from the
# raw frame, then the PyTorch model is trained on it against total_bill.
my_model = LinearRegressionModel()
pipe = Pipeline(
    steps=[
        ("patsy", PatsyTransformer("tip + size")),
        ("model", my_model),
    ]
)
pipe.fit(df, df["total_bill"])
This is not only because the model is too simple. If I use sklearn's linear regression estimated via stochastic gradient descent (SGDRegressor), the results look fine. Therefore, I conclude that the problem lies within my PyTorch class.
# +
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Reference pipeline: same preprocessing approach, but with sklearn's
# LinearRegression as the final estimator and a richer formula.
pipe2 = Pipeline(
    steps=[
        ("patsy", PatsyTransformer("tip + C(size) + C(time)")),
        ("model", LinearRegression()),
    ]
)
pipe2.fit(df, df["total_bill"])
# -
# In-sample mean squared error of the reference pipeline on total_bill.
mean_squared_error(y_true=df["total_bill"], y_pred=pipe2.predict(df))
The problem in this implementation is in the `fit` method: the loss compares the prediction with the design matrix `X`:
# Compute and print loss
loss = self.loss_func(pred_y, X)
It should instead compare the prediction with the true target `y`:
loss = self.loss_func(pred_y, y)