I have written my own custom classifier, which discretizes the dependent variable into bins. This is the code:
class OwnClassifier(BaseEstimator, ClassifierMixin):
    """Classifier wrapper that discretizes the target into 4 ordinal bins.

    NOTE(review): as written, ``fit`` only bins the target and never fits the
    wrapped estimator, and ``score`` ignores the ``y`` it receives and compares
    predictions against the stored training targets instead.
    """

    def __init__(self, estimator=None):
        # Binned training targets; populated by fit().
        self.yt = None
        # Fall back to a liblinear logistic regression when no estimator is given.
        if estimator is None:
            estimator = LogisticRegression(solver='liblinear')
        self.estimator = estimator
        # Discretizer that maps the target to 4 ordinal bin labels.
        self.discr = KBinsDiscretizer(n_bins=4, encode='ordinal')

    def fit(self, X, y):
        """Bin ``y`` and store the result on ``self.yt``; returns ``self``.

        NOTE(review): ``X`` is unused and ``self.estimator`` is not fitted here.
        """
        self.yt = y.copy()
        # The discretizer is fed a single column, hence the reshape.
        self.yt = self.discr.fit_transform(self.yt.reshape(-1, 1)).astype(int)
        return self

    def predict(self, X):
        """Delegate to the wrapped estimator's ``predict``."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped estimator's ``predict_proba``."""
        return self.estimator.predict_proba(X)

    def score(self, X, y=None):
        """Return the accuracy of ``predict(X)`` against ``self.yt``.

        NOTE(review): ``y`` is ignored; the reference labels are always the
        binned targets stored by ``fit``, whatever ``X`` is passed in.
        """
        return accuracy_score(self.yt, self.predict(X))
When using GridSearchCV on it, it throws an error:
# Candidate values for the wrapped estimator's C parameter; the "estimator__"
# prefix addresses a parameter of the nested estimator.
grid = [{'estimator__C': [1, 10, 100, 1000]}]
myLogi = OwnClassifier()
# Grid search with GridSearchCV's default settings.
gridCv = GridSearchCV(myLogi, grid)
# X and y are the Boston Housing arrays loaded in the snippet further down.
gridCv.fit(X, y)
How can I make the classifier compatible with GridSearchCV?
I use the Boston Housing data:
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this snippet
# needs an older scikit-learn release to run.
boston_data = load_boston()
# Feature matrix.
X = boston_data['data']
# Target values; these are what OwnClassifier discretizes.
y = boston_data['target']
ValueError: Found input variables with inconsistent numbers of samples: [404, 102]
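The problem is in the score method: it ignores the y it is given and always compares the predictions against the stored training targets self.yt.
During cross-validation, score is called on the held-out fold (102 samples) while self.yt still holds the 404 training targets, which is why the traceback reports inconsistent numbers of samples. Note also that the original fit never fits the wrapped estimator. Both issues are fixed in the code below: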
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_boston
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import GridSearchCV
class OwnClassifier(BaseEstimator, ClassifierMixin):
    """Classifier wrapper that discretizes a continuous target before fitting.

    The target is mapped to 4 ordinal bins with a ``KBinsDiscretizer`` and the
    wrapped estimator is trained on (and scored against) those bin labels.

    Parameters
    ----------
    estimator : estimator object, optional
        Classifier to train on the binned target. Defaults to
        ``LogisticRegression(solver='liblinear')``.
    """

    def __init__(self, estimator=None):
        # The default is instantiated here (not deferred to fit) so that nested
        # parameters such as ``estimator__C`` can be set on a default instance.
        if estimator is None:
            estimator = LogisticRegression(solver='liblinear')
        self.estimator = estimator
        self.discr = KBinsDiscretizer(n_bins=4, encode='ordinal')

    def fit(self, X, y):
        """Fit the discretizer on ``y`` and the estimator on the binned target.

        Parameters
        ----------
        X : array-like
            Training features, passed through to the wrapped estimator.
        y : numpy.ndarray
            Continuous target values (must support ``reshape``).

        Returns
        -------
        self : OwnClassifier
            The fitted instance, as the scikit-learn estimator API requires.
        """
        # The discretizer expects a 2-D column; ravel back to 1-D labels.
        y_binned = self.discr.fit_transform(y.reshape(-1, 1)).astype(int).ravel()
        self.estimator.fit(X, y_binned)
        return self

    def predict(self, X):
        """Return the predicted bin label for each row of ``X``."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's class probabilities for ``X``."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against the binned ``y``.

        ``y`` is binned with the discretizer fitted in ``fit``, so the labels
        compared always belong to the same samples as ``X``.
        """
        y_binned = self.discr.transform(y.reshape(-1, 1)).astype(int).ravel()
        return accuracy_score(y_binned, self.predict(X))
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this script
# needs an older scikit-learn release to run.
boston_data = load_boston()
# Feature matrix and the target that OwnClassifier discretizes.
X = boston_data['data']
y = boston_data['target']
# Candidate values for the wrapped estimator's C parameter; the "estimator__"
# prefix addresses a parameter of the nested estimator.
grid = [{'estimator__C': [1, 10, 100, 1000]}]
myLogi = OwnClassifier()
# Grid search with GridSearchCV's default settings.
gridCv = GridSearchCV(myLogi, grid)
gridCv.fit(X, y)