Tags: python, machine-learning, scikit-learn, google-colaboratory, grid-search

Grid search not giving the best parameters


When running a grid search over the inverse-of-regularization-strength parameter C (for logistic regression and linear SVM) and the number-of-neighbors parameter (for the K-nearest-neighbors classifier), the best parameters reported by grid search are not actually the best when I verify them manually by training on the same training data set. Code below:

# Convert to a DataFrame.
import pandas as pd
from sklearn.datasets import fetch_openml

df = fetch_openml('credit-g', as_frame=True).frame
df.head(5)

df.dtypes

import matplotlib.pyplot as plt

fig = plt.figure(figsize=(12, 12))
st = fig.suptitle("univariate distributions and target distribution", fontsize=20)

# Using columns that we need for this plot
nfeatures = df[['duration', 'credit_amount' , 'age']]
target = df['class']

# creating 4x4 grid
grid = plt.GridSpec(4, 4, hspace=0.4, wspace=0.4)

# creating the plots in grid positions 1, 2, 3 and 4
p1 = fig.add_subplot(grid[:2,:2])
p2 = fig.add_subplot(grid[:2,2:])
p3 = fig.add_subplot(grid[2:,:2])
p4 = fig.add_subplot(grid[2:,2:])

p1.hist(nfeatures['duration'])
p2.hist(nfeatures['credit_amount'])
p3.hist(nfeatures['age'])
p4.hist(target)

p1.set_xlabel('duration')
p2.set_xlabel('credit_amount')
p3.set_xlabel('age')
p4.set_xlabel('class')
# customizing to look neat
st.set_y(0.95)
fig.subplots_adjust(top=0.92)


from sklearn.model_selection import train_test_split

columns = [column for column  in df.columns if column != 'class']
X = df[columns]
y = df['class']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3 ,random_state=11)
#X_train , y_train , X_valid , y_valid = train_test_split(X,) 
# basic preprocessing on train sets
# numeric_columns = ['duration','credit_amount' , 'installment_commitment' , 'residence_since' , 'age' ,'existing_credits' , 'num_dependents' ]
numeric_columns = df.select_dtypes(include=['float64']).columns
categorical_columns = [column for column in columns if column not in numeric_columns]
temp = X_train[categorical_columns]
X_train_ohe = pd.concat([pd.get_dummies(temp),X_train[numeric_columns]],axis=1)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
lr = LogisticRegression(max_iter=1000)

cr = cross_val_score(lr,X_train_ohe,y_train)

print(cr)


from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

# define the data preparation for the categorical columns
t1 = [('cat', OneHotEncoder(), categorical_columns)]
col_transform = ColumnTransformer(transformers=t1)
# define the models
models = {'lr_model':LogisticRegression(max_iter=1000), 'lsvm_model':LinearSVC(max_iter=2500) , 'knn_model':KNeighborsClassifier()}

for name,model in models.items():
  # define the data preparation and modeling pipeline
  pipeline = Pipeline(steps=[('prep',col_transform), ('m', model)])
  # define the model cross-validation configuration
  #cv = KFold(n_splits=10, shuffle=True, random_state=1)
  # evaluate the pipeline with cross-validation (default scoring for classifiers: accuracy)
  score = cross_val_score(pipeline, X_train, y_train)
  print(name ,score.mean())

# define the data preparation for the categorical columns and numeric columns
t2 = [('cat', OneHotEncoder(), categorical_columns), ('num', StandardScaler(), numeric_columns)]
col_transform = ColumnTransformer(transformers=t2)
# try with new column transformer
for name,model in models.items():
  # define the data preparation and modeling pipeline
  pipeline = Pipeline(steps=[('prep',col_transform), ('m', model)])
  # define the model cross-validation configuration
  #cv = KFold(n_splits=10, shuffle=True, random_state=1)
  # evaluate the pipeline with cross-validation (default scoring for classifiers: accuracy)
  score = cross_val_score(pipeline, X_train, y_train)
  print(name ,score.mean())

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer

f1_scorer = make_scorer(f1_score, pos_label="bad")

# 'prep__num__with_mean': [True, False],
# 'prep__num__with_std': [True, False],
param_grid = {
    'm__C': [0.1, 1.0 , 0.01],
    }

param_grid_knn = {
    'm__n_neighbors': [5, 10 , 15],
    }

for name,model in models.items():
  # define the data preparation and modeling pipeline
  pipeline = Pipeline(steps=[('prep',col_transform), ('m', model)])
  # define the model cross-validation configuration
  #cv = KFold(n_splits=10, shuffle=True, random_state=1)
  # tune the pipeline with grid search cross-validation, scored by F1 on the "bad" class
  if name == 'knn_model':
      grid_clf = GridSearchCV(pipeline, param_grid_knn, cv=5, scoring=f1_scorer )
  else:
      grid_clf = GridSearchCV(pipeline, param_grid, cv=5, scoring=f1_scorer)
  grid_clf.fit(X_train, y_train)
  print(name,grid_clf.best_params_)
  print(name, grid_clf.best_estimator_.score(X_test, y_test))

lr_array = []
lr_c = [0.01,0.1,1]

for c in lr_c:
  pipeline = Pipeline(steps=[('prep',col_transform), ('m', LogisticRegression(max_iter=1000, C=c))])
  pipeline.fit(X_train,y_train)
  y_hat = pipeline.predict(X_train)
  lr_array.append(f1_score(y_train,y_hat,pos_label="bad"))


lsvm_array = []
lsvm_c = [0.01,0.1,1]

for c in lsvm_c:
  pipeline = Pipeline(steps=[('prep',col_transform), ('m', LinearSVC(dual=True,max_iter=2500,C=c))])
  pipeline.fit(X_train,y_train)
  y_hat = pipeline.predict(X_train)
  lsvm_array.append(f1_score(y_train,y_hat,pos_label="bad"))


knn_array = []
knn_n = [5,10,15]

for n in knn_n:
  pipeline = Pipeline(steps=[('prep',col_transform), ('m', KNeighborsClassifier(n_neighbors=n))])
  pipeline.fit(X_train,y_train)
  y_hat = pipeline.predict(X_train)
  knn_array.append(f1_score(y_train,y_hat,pos_label="bad"))

fig = plt.figure(figsize=(12, 12))
# creating 3x1 grid
grid = plt.GridSpec(3, 1, hspace=0.4, wspace=0.4)

# creating the plots in grid rows 1, 2 and 3
p1 = fig.add_subplot(grid[0,:])
p2 = fig.add_subplot(grid[1,:])
p3 = fig.add_subplot(grid[2,:])

p1.scatter(lr_c,lr_array)
p2.scatter(lsvm_c,lsvm_array)
p3.scatter(knn_n,knn_array)

The trend changes when using different scores and when evaluating on the test set instead of the training set, but the best parameters never seem to be the same for the grid search and for the manual verification. What could be the reason for this? For example, if you run the code above, grid search tells you that 10 is the best value for n_neighbors, but the graph at the end shows that 5 does better. Is the comparison not implemented correctly? You can check the runs with output at this link: https://github.com/binodmathews93/AppliedMachineLearningCourse/blob/master/Applied_Machine_Learning_Homework_2.ipynb


Solution

  • Hyperparameter tuning is performed on the validation (development) set, not on the training set.

    Grid search cross-validation uses a K-fold strategy to build a validation set that is used only for evaluation, not for training.

    You are manually performing training and validation on the same set, which is an incorrect approach.

    pipeline = Pipeline(steps=[('prep',col_transform), ('m', LogisticRegression(max_iter=1000, C=c))])
    pipeline.fit(X_train,y_train)       # <- here is the problem
    y_hat = pipeline.predict(X_train)
    lr_array.append(f1_score(y_train,y_hat,pos_label="bad"))
    

    This will only lead to hyperparameter choices that boost performance on the training set, which is not what you want (you want a set of hyperparameters that leads to good performance on the test set, i.e. one that generalizes well).

    This is the reason why the K (in KNN) is lower when you do the manual testing: a lower K means less "regularization" and is therefore the optimal, though incorrect, choice from the perspective of the training set.

    If you want to verify the results manually, you will need to build a validation set yourself (and not use it during training), or you will need to call the K-fold cross-validation procedure manually, as sketched below.
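
    For illustration, here is a minimal sketch of both options, shown only for the logistic regression C values (the same loop applies to LinearSVC and KNeighborsClassifier). It reuses the col_transform, X_train, y_train and f1_scorer objects already defined in the question; the split ratio, random_state and intermediate variable names are just illustrative assumptions:

    from sklearn.model_selection import train_test_split, cross_val_score
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import f1_score

    # Option 1: carve a validation set out of the training data,
    # fit on the reduced training set and score on the held-out part.
    X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train,
                                                test_size=0.2, random_state=11)
    for c in [0.01, 0.1, 1]:
        pipeline = Pipeline(steps=[('prep', col_transform),
                                   ('m', LogisticRegression(max_iter=1000, C=c))])
        pipeline.fit(X_tr, y_tr)
        y_hat = pipeline.predict(X_val)   # predict on data not seen during fitting
        print('C=%s validation f1: %.3f' % (c, f1_score(y_val, y_hat, pos_label='bad')))

    # Option 2: mimic GridSearchCV directly by cross-validating each candidate
    # on the training set with the same scorer.
    for c in [0.01, 0.1, 1]:
        pipeline = Pipeline(steps=[('prep', col_transform),
                                   ('m', LogisticRegression(max_iter=1000, C=c))])
        scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring=f1_scorer)
        print('C=%s mean 5-fold f1: %.3f' % (c, scores.mean()))

    With either of these checks, the candidate that scores best should match (up to fold-to-fold noise) what GridSearchCV reports, because the model is now evaluated on data it was not fitted on.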