Tags: python, tensorflow, machine-learning, keras, neural-network

How to integrate a Keras model with sequential backward selection code?


I am trying to integrate a Keras deep neural network as a classifier within code for sequential backward feature selection in Python. (Originally, I tried to wrap the Keras network with SciKeras so I could use scikit-learn's built-in sequential feature selection models, but I kept getting error messages.)

I found this from-scratch implementation of sequential backward feature selection (taken from https://vitalflux.com/sequential-backward-feature-selection-python-example/) and have been trying to substitute a Keras model for the "estimator" in the class, but I keep getting this error: ValueError: Input 0 of layer "sequential_410" is incompatible with the layer: expected shape=(None, 45), found shape=(None, 44)
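
For reference, the mismatch can be reproduced in isolation. Here is a minimal sketch with made-up data (not my real dataset) that raises the same error:

import numpy as np
from tensorflow import keras

# Model built for 45 input features, matching my full training set
model = keras.Sequential([keras.layers.Dense(1, input_shape=(45,))])
model.compile(optimizer='adam', loss='binary_crossentropy')

# Once the selection loop drops a feature, only 44 columns are passed in
X_subset = np.random.rand(10, 44)
model.predict(X_subset)  # ValueError: expected shape=(None, 45), found shape=(None, 44)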

Here is the code that I have so far for the sequential backward feature selection and the deep neural network:

import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier, KerasRegressor

# SBS (sequential backward feature selection) from scratch
#=====================================================
from sklearn.metrics import accuracy_score
from itertools import combinations
from sklearn.base import clone

 
class SequentialBackwardSearch():
    '''
    Instantiate with Estimator and given number of features
    '''
    def __init__(self, estimator, k_features):
        self.estimator = clone(estimator)
        self.k_features = k_features
         
    '''
    X_train - Training data Pandas dataframe
    X_test - Test data Pandas dataframe
    y_train - Training label Pandas dataframe
    y_test - Test label Pandas dataframe
    '''
    def fit(self, X_train, X_test, y_train, y_test):
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train.values, X_test.values,
                                 y_train.values, y_test.values, self.indices_)
        self.scores_ = [score]
        '''
        Iterate through all the dimensions until k_features is reached
        At the end of loop, dimension count is reduced by 1
        '''
        while dim > k_features:
            scores = []
            subsets = []
            '''
            Iterate through different combinations of features, train the model,
            record the score
            '''
            for p in combinations(self.indices_, r=dim - 1):
                score = self._calc_score(X_train.values, X_test.values, y_train.values, y_test.values, p)
                scores.append(score)
                subsets.append(p)
            #
            # Get the index of best score
            #
            best_score_index = np.argmax(scores)
            #
            # Record the best score
            #
            self.scores_.append(scores[best_score_index])
            #
            # Get the indices of features which gave best score
            #
            self.indices_ = subsets[best_score_index]
            #
            # Record the indices of features for best score
            #
            self.subsets_.append(self.indices_)
            dim -= 1 # Dimension is reduced by 1
     
    '''
    Transform training, test data set to the data set
    having features which gave best score
    '''
    def transform(self, X):
        return X.values[:, self.indices_]
     
    '''
    Train models with specific set of features
    indices - indices of features
    '''
    def _calc_score(self, X_train, X_test, y_train, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train.ravel())
        y_pred = self.estimator.predict(X_test[:, indices])
        score = accuracy_score(y_test, y_pred)
        return score


# ===============================================
# Keras deep neural network

def dnn():
    model = keras.Sequential([
        layers.Dense(20, activation='relu', input_shape=(X_train.shape[1],)),
        layers.Dropout(0.3),
        layers.Dense(20, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid'),
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['binary_accuracy'],
    )

    early_stopping = keras.callbacks.EarlyStopping(
        patience=5,
        min_delta=0.001,
        restore_best_weights=True,
    )

    history = model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        batch_size=512,
        callbacks=[early_stopping],
    )
    history_df = pd.DataFrame(history.history)
    print("Minimum Validation Loss: {:0.4f}".format(history_df['val_loss'].min()))
    history_df.loc[:, ['loss', 'val_loss']].plot(title="Cross-entropy")
    history_df.loc[:, ['binary_accuracy', 'val_binary_accuracy']].plot(title="Accuracy")

    return model

keras_clf = KerasClassifier(dnn,
                            epochs=5,
                            verbose=False)

# Mark the wrapper as a classifier so scikit-learn helpers such as is_classifier() accept it
keras_clf._estimator_type = "classifier"



And this is the code I have for trying to integrate them together:

k_features = 5
#
# Instantiate SequentialBackwardSearch
#
sbs = SequentialBackwardSearch(keras_clf, k_features)
#
# Fit the data to determine the k_features which give the
# most optimal model performance
#
sbs.fit(X_train, X_test, y_train, y_test)
#
# Transform the training data set to dataset having k_features
# giving most optimal model performance
#
X_train_kfeatures = sbs.transform(X_train)
#
# Transform the test data set to dataset having k_features
#
X_test_kfeatures = sbs.transform(X_test)

sbs.indices_
X_train.columns[list(sbs.indices_)] # sbs is an instance of SequentialBackwardSearch class

I am wondering whether this is even possible (integrating a neural network into existing code for sequential backward feature selection), or whether there's anything I can do to get it to run and output the top 5 features from the training dataset. I have tried to address the error message by altering the input shape of the neural network, but I believe it is already correct (45 features). Any help or advice would be welcome!


Solution

  • This should work with SciKeras!

    I had to clean up your code and fix a couple of bugs: the loop condition now reads while dim > self.k_features (it previously referenced the global k_features), and the .values calls are gone so the class works directly on NumPy arrays. I first did a "sanity check" using Scikit-Learn's MLPClassifier, then I ran the same search against an MLPClassifier built with Keras. Details may differ for more complex model architectures, but this shows that it does work.

    import numpy as np
    
    # SBS (sequential backward feature selection) from scratch
    #=====================================================
    from sklearn.metrics import accuracy_score
    from itertools import combinations
    from sklearn.base import clone
    
     
    class SequentialBackwardSearch:
        '''
        Instantiate with Estimator and given number of features
        '''
        def __init__(self, estimator, k_features):
            self.estimator = clone(estimator)
            self.k_features = k_features
             
        '''
        X_train - Training data Pandas dataframe
        X_test - Test data Pandas dataframe
        y_train - Training label Pandas dataframe
        y_test - Test label Pandas dataframe
        '''
        def fit(self, X_train, X_test, y_train, y_test):
            dim = X_train.shape[1]
            self.indices_ = tuple(range(dim))
            self.subsets_ = [self.indices_]
            score = self._calc_score(X_train, X_test,
                                     y_train, y_test, self.indices_)
            self.scores_ = [score]
            '''
            Iterate through all the dimensions until k_features is reached
            At the end of loop, dimension count is reduced by 1
            '''
            while dim > self.k_features:
                scores = []
                subsets = []
                '''
                Iterate through different combinations of features, train the model,
                record the score
                '''
                for p in combinations(self.indices_, r=dim - 1):
                    score = self._calc_score(X_train, X_test, y_train, y_test, p)
                    scores.append(score)
                    subsets.append(p)
                #
                # Get the index of best score
                #
                best_score_index = np.argmax(scores)
                #
                # Record the best score
                #
                self.scores_.append(scores[best_score_index])
                #
                # Get the indices of features which gave best score
                #
                self.indices_ = subsets[best_score_index]
                #
                # Record the indices of features for best score
                #
                self.subsets_.append(self.indices_)
                dim -= 1 # Dimension is reduced by 1
         
        '''
        Transform training, test data set to the data set
        having features which gave best score
        '''
        def transform(self, X):
            return np.asarray(X)[:, self.indices_]  # accepts DataFrames and arrays alike
         
        '''
        Train models with specific set of features
        indices - indices of features
        '''
        def _calc_score(self, X_train, X_test, y_train, y_test, indices):
            self.estimator.fit(X_train[:, indices], y_train.ravel())
            y_pred = self.estimator.predict(X_test[:, indices])
            score = accuracy_score(y_test, y_pred)
            return score
    
    # Sklearn MLPClassifier
    
    from sklearn.neural_network import MLPClassifier
    
    estimator = MLPClassifier()
    
    search = SequentialBackwardSearch(estimator, 1)
    
    # Toy data: y is an exact copy of the last feature (column 4), so a
    # correct backward search should end up keeping exactly that column.
    X = np.random.randint(0, 2, size=(100, 5))
    y = X[:, -1]
    
    search.fit(X, X, y, y)
    
    assert list(search.indices_) == [4]
    
    # SciKeras MLPClassifier
    # see https://www.adriangb.com/scikeras/stable/notebooks/MLPClassifier_MLPRegressor.html
    
    import tensorflow.keras as keras
    from scikeras.wrappers import KerasClassifier
    
    class KerasMLPClassifier(KerasClassifier):
    
        def __init__(
            self,
            hidden_layer_sizes=(100, ),
            optimizer="adam",
            optimizer__learning_rate=0.001,
            epochs=200,
            verbose=0,
            **kwargs,
        ):
            super().__init__(**kwargs)
            self.hidden_layer_sizes = hidden_layer_sizes
            self.optimizer = optimizer
            self.epochs = epochs
            self.verbose = verbose
    
        def _keras_build_fn(self, compile_kwargs):
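            # Note: SciKeras sets self.n_features_in_, self.target_type_ and
            # self.n_classes_ from the training data during fit(), before it
            # calls this build function, so the input layer is rebuilt to match
            # whatever feature subset the search passes in; that is what avoids
            # the "expected shape=(None, 45), found shape=(None, 44)" error.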
            model = keras.Sequential()
            inp = keras.layers.Input(shape=(self.n_features_in_,))
            model.add(inp)
            for hidden_layer_size in self.hidden_layer_sizes:
                layer = keras.layers.Dense(hidden_layer_size, activation="relu")
                model.add(layer)
            if self.target_type_ == "binary":
                n_output_units = 1
                output_activation = "sigmoid"
                loss = "binary_crossentropy"
            elif self.target_type_ == "multiclass":
                n_output_units = self.n_classes_
                output_activation = "softmax"
                loss = "sparse_categorical_crossentropy"
            else:
                raise NotImplementedError(f"Unsupported task type: {self.target_type_}")
            out = keras.layers.Dense(n_output_units, activation=output_activation)
            model.add(out)
            model.compile(loss=loss, optimizer=compile_kwargs["optimizer"])
            return model
    
    estimator2 = KerasMLPClassifier()
    
    search2 = SequentialBackwardSearch(estimator2, 1)
    
    search2.fit(X, X, y, y)
    
    assert list(search2.indices_) == [4]
    

    Notebook version (can't promise this will be around forever): https://colab.research.google.com/drive/1EWxT3GWZsqhftz4f7W5GsXNe_SPtva4H#scrollTo=chU7wLn1BTU1
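
    For completeness, here is how the wrapper plugs back into the original pipeline. This is a sketch that assumes X_train, X_test, y_train and y_test are the pandas objects from the question (hence the .values conversions):

    estimator = KerasMLPClassifier(epochs=5, verbose=0)
    sbs = SequentialBackwardSearch(estimator, k_features=5)

    # The cleaned-up class works on NumPy arrays, so convert the DataFrames
    sbs.fit(X_train.values, X_test.values, y_train.values, y_test.values)

    print(sbs.indices_)                         # indices of the 5 surviving features
    print(X_train.columns[list(sbs.indices_)])  # their column names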