Tags: python, flask, jupyter-notebook, google-colaboratory, pickle

Unpickle instance from Jupyter Notebook in Flask App


I have created a class for word2vec vectorisation, and it works fine on its own. But when I save the fitted model to a pickle file and load that file in a Flask app, I get the following error:

AttributeError: module '__main__' has no attribute 'GensimWord2VecVectorizer'

I am creating the model on Google Colab.

Code in Jupyter Notebook:

# Word2Vec Model
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec

class GensimWord2VecVectorizer(BaseEstimator, TransformerMixin):

    def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None,
                 sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5,
                 ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
                 trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False,
                 callbacks=(), max_final_vocab=None):
        self.size = size
        self.alpha = alpha
        self.window = window
        self.min_count = min_count
        self.max_vocab_size = max_vocab_size
        self.sample = sample
        self.seed = seed
        self.workers = workers
        self.min_alpha = min_alpha
        self.sg = sg
        self.hs = hs
        self.negative = negative
        self.ns_exponent = ns_exponent
        self.cbow_mean = cbow_mean
        self.hashfxn = hashfxn
        self.iter = iter
        self.null_word = null_word
        self.trim_rule = trim_rule
        self.sorted_vocab = sorted_vocab
        self.batch_words = batch_words
        self.compute_loss = compute_loss
        self.callbacks = callbacks
        self.max_final_vocab = max_final_vocab

    def fit(self, X, y=None):
        self.model_ = Word2Vec(
            sentences=X, corpus_file=None,
            size=self.size, alpha=self.alpha, window=self.window, min_count=self.min_count,
            max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed,
            workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs,
            negative=self.negative, ns_exponent=self.ns_exponent, cbow_mean=self.cbow_mean,
            hashfxn=self.hashfxn, iter=self.iter, null_word=self.null_word,
            trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words,
            compute_loss=self.compute_loss, callbacks=self.callbacks,
            max_final_vocab=self.max_final_vocab)
        return self

    def transform(self, X):
        X_embeddings = np.array([self._get_embedding(words) for words in X])
        return X_embeddings

    def _get_embedding(self, words):
        valid_words = [word for word in words if word in self.model_.wv.vocab]
        if valid_words:
            embedding = np.zeros((len(valid_words), self.size), dtype=np.float32)
            for idx, word in enumerate(valid_words):
                embedding[idx] = self.model_.wv[word]

            return np.mean(embedding, axis=0)
        else:
            return np.zeros(self.size)
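
# For reference, the vectorizer behaves like any other scikit-learn transformer.
# A minimal usage sketch with toy data (assuming gensim 3.x, whose Word2Vec
# still accepts the size and iter keyword arguments used above):
docs = [['hello', 'world', 'hello'], ['machine', 'learning', 'rocks']]
vec = GensimWord2VecVectorizer(size=10, min_count=1, iter=2)
vec.fit(docs)
print(vec.transform(docs).shape)  # (2, 10): one averaged embedding per document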

# column transformer
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer([
    ('step1', GensimWord2VecVectorizer(), 'STATUS')
], remainder='drop')

# Create Model
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# ##########
# SVC - support vector classifier
# ##########
# defining parameter range
hyperparameters = {'C': [0.1, 1],
                   'gamma': [1, 0.1],
                   'kernel': ['rbf'],
                   'probability': [True]}
model_sv = Pipeline([
    ('column_transformers', ct),
    ('model', GridSearchCV(SVC(), hyperparameters,
                           refit=True, verbose=3)),
])
model_sv_cEXT = model_sv.fit(X_train, y_train['cEXT'])
# Save the trained cEXT - SVM Model.
import joblib
joblib.dump(model_sv_cEXT, 'model_Word2Vec_sv_cEXT.pkl')

Code in Flask App:

# Word2Vec
import joblib
model_EXT_WV_SV = joblib.load('utility/model/MachineLearning/SVM/model_Word2Vec_sv_cEXT.pkl')

I also tried copying the same class into my Flask file, but that does not work either:

import joblib
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec

class GensimWord2VecVectorizer(BaseEstimator, TransformerMixin):
    ...  # identical to the class definition in the notebook above

# Word2Vec
model_EXT_WV_SV = joblib.load('utility/model/MachineLearning/SVM/model_Word2Vec_sv_cEXT.pkl')

GitHub code: https://github.com/Juned-Ansari/test

Pickle file: https://github.com/Juned-Ansari/test/blob/main/model_Word2Vec_sv_cEXT.pkl

Flask Web App: https://github.com/Juned-Ansari/test/tree/main/WebApp


Solution

  • From https://docs.python.org/3/library/pickle.html:

    pickle can save and restore class instances transparently, however the class definition must be importable and live in the same module as when the object was stored.

    The following types can be pickled:

    • ...
    • classes that are defined at the top level of a module
    • instances of such classes ...
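
    In other words, a pickle stream stores only a module-plus-name reference to a class, never the class definition itself. A minimal sketch with a hypothetical Foo class shows what gets recorded:

    import pickle, pickletools

    class Foo:  # defined at the top level of a script, i.e. in module __main__
        pass

    data = pickle.dumps(Foo())
    pickletools.dis(data)  # the stream holds only the reference '__main__' 'Foo'
    # Unpickling elsewhere succeeds only if that process can import Foo from a
    # module named __main__ -- hence the AttributeError in the Flask app.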

    Considering your directory structure:

    ├── WebApp/
    │  └── app.py
    └── Untitled.ipynb
    

    Assuming you run flask run from within WebApp/, app is a top-level module.

    First, move class GensimWord2VecVectorizer to the top level of WebApp/app.py.
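
    (Merely copying the class into app.py, as attempted in the question, is not enough on its own: the model was dumped while the class lived in the notebook's __main__ module, so the pickle stream references __main__.GensimWord2VecVectorizer, which does not exist in the Flask process. The recorded module name must match a module the Flask app can import, which the next step arranges.)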

    Next, in your Jupyter Notebook, import GensimWord2VecVectorizer and trick pickle into thinking it comes from a top-level module named app:

    from WebApp.app import GensimWord2VecVectorizer
    GensimWord2VecVectorizer.__module__ = 'app'  # dumped pickles record 'app', not 'WebApp.app'

    import sys
    sys.modules['app'] = sys.modules['WebApp.app']  # so 'app' also resolves in this session
    

    Then you should be able to dump and load the pickle file.
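
    For example, reusing the names from the question, the dump now records the class as app.GensimWord2VecVectorizer:

    import joblib
    joblib.dump(model_sv_cEXT, 'model_Word2Vec_sv_cEXT.pkl')
    # WebApp/app.py can unpickle this file, because it can resolve
    # app.GensimWord2VecVectorizer at load time.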

    Google Colab

    If it's troublesome to import local modules, do this instead:

    GensimWord2VecVectorizer.__module__ = 'app'

    import sys
    app = sys.modules['app'] = type(sys)('app')  # fabricate an empty module named 'app'
    app.GensimWord2VecVectorizer = GensimWord2VecVectorizer  # and attach the class to it
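
    Here type(sys) is simply types.ModuleType, so the fabricated module could equally be created with app = types.ModuleType('app').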
    

    Again, you should then be able to dump and load the pickle file.