I have created a class for word2vec vectorisation which is working fine. But when I create a model pickle file and use that pickle file in a Flask App, I am getting an error like:
AttributeError: module '__main__' has no attribute 'GensimWord2VecVectorizer'
I am creating the model on Google Colab.
Code in Jupyter Notebook:
# Word2Vec Model
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec
class GensimWord2VecVectorizer(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that embeds word sequences with gensim Word2Vec.

    ``fit`` trains a ``Word2Vec`` model on the input and stores it as
    ``model_``; ``transform`` maps every item of its input to the mean of the
    vectors of the words that are in the trained vocabulary.

    Every constructor argument is stored unchanged on the instance and is
    forwarded to ``Word2Vec`` under the same keyword name when ``fit`` runs.

    Note:
        ``pickle`` stores instances by reference to the module that defines
        this class, so the class must be importable from that same module
        in the process that loads a pickled instance.

    NOTE(review): the ``size``/``iter`` keywords and ``wv.vocab`` look like the
    gensim 3.x API -- confirm against the installed gensim version.
    """

    def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None,
                 sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5,
                 ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
                 trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False,
                 callbacks=(), max_final_vocab=None):
        """Store the Word2Vec hyperparameters; no model is trained here."""
        self.size = size
        self.alpha = alpha
        self.window = window
        self.min_count = min_count
        self.max_vocab_size = max_vocab_size
        self.sample = sample
        self.seed = seed
        self.workers = workers
        self.min_alpha = min_alpha
        self.sg = sg
        self.hs = hs
        self.negative = negative
        self.ns_exponent = ns_exponent
        self.cbow_mean = cbow_mean
        self.hashfxn = hashfxn
        self.iter = iter
        self.null_word = null_word
        self.trim_rule = trim_rule
        self.sorted_vocab = sorted_vocab
        self.batch_words = batch_words
        self.compute_loss = compute_loss
        self.callbacks = callbacks
        self.max_final_vocab = max_final_vocab

    def fit(self, X, y=None):
        """Train a Word2Vec model on ``X`` and keep it as ``self.model_``.

        Args:
            X: Passed to ``Word2Vec`` as ``sentences`` (presumably an iterable
                of token lists -- verify against the caller).
            y: Unused; accepted so the method matches the estimator interface.

        Returns:
            This instance, so calls can be chained.
        """
        self.model_ = Word2Vec(
            sentences=X, corpus_file=None,
            size=self.size, alpha=self.alpha, window=self.window, min_count=self.min_count,
            max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed,
            workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs,
            negative=self.negative, ns_exponent=self.ns_exponent, cbow_mean=self.cbow_mean,
            hashfxn=self.hashfxn, iter=self.iter, null_word=self.null_word,
            trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words,
            compute_loss=self.compute_loss, callbacks=self.callbacks,
            max_final_vocab=self.max_final_vocab)
        return self

    def transform(self, X):
        """Return an array with one embedding row per item of ``X``.

        Requires ``fit`` to have been called first, because the embeddings are
        read from ``self.model_``.
        """
        return np.array([self._get_embedding(words) for words in X])

    def _get_embedding(self, words):
        """Average the vectors of the in-vocabulary words of ``words``.

        Returns a float32 vector of length ``self.size``; it is all zeros when
        none of the words is in the vocabulary.
        """
        vocab = self.model_.wv.vocab
        vectors = [self.model_.wv[word] for word in words if word in vocab]
        if not vectors:
            # Same dtype as the averaged case, so the dtype of transform()'s
            # output does not depend on whether some item has no known words.
            return np.zeros(self.size, dtype=np.float32)
        return np.mean(np.asarray(vectors, dtype=np.float32), axis=0)
# column transformer
from sklearn.compose import ColumnTransformer
# Vectorize only the 'STATUS' column with a default-configured Word2Vec
# vectorizer; remainder='drop' discards every other column.
ct = ColumnTransformer(
    transformers=[('step1', GensimWord2VecVectorizer(), 'STATUS')],
    remainder='drop',
)
# Create Model
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pickle
import numpy as np
import dill
import torch
# ##########
# SVC - support vector classifier
# ##########
# Parameter grid handed to GridSearchCV for the SVC step.
hyperparameters = dict(
    C=[0.1, 1],
    gamma=[1, 0.1],
    kernel=['rbf'],
    probability=[True],
)
# Two-step pipeline: the column transformer, then a grid-searched SVC.
model_sv = Pipeline(steps=[
    ('column_transformers', ct),
    ('model', GridSearchCV(estimator=SVC(), param_grid=hyperparameters,
                           refit=True, verbose=3)),
])
# Fit the pipeline on the training data with the 'cEXT' column as target.
model_sv_cEXT = model_sv.fit(X_train, y_train['cEXT'])
# Save the trained cEXT - SVM Model.
# NOTE(review): the dumped pipeline holds a GensimWord2VecVectorizer that is
# defined in this notebook, so the file presumably records the class as
# `__main__.GensimWord2VecVectorizer`; the loading process must expose the
# class under that same module path.
import joblib
joblib.dump(model_sv_cEXT, 'model_Word2Vec_sv_cEXT.pkl')
Code in Flask App:
# Word2Vec
# Load the pickled cEXT pipeline. Unpickling has to resolve
# GensimWord2VecVectorizer under the module name stored in the file.
model_EXT_WV_SV = joblib.load('utility/model/MachineLearning/SVM/model_Word2Vec_sv_cEXT.pkl')
I tried to copy the same class into my Flask file, but it is also not working.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec
class GensimWord2VecVectorizer(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that embeds word sequences with gensim Word2Vec.

    ``fit`` trains a ``Word2Vec`` model on the input and stores it as
    ``model_``; ``transform`` maps every item of its input to the mean of the
    vectors of the words that are in the trained vocabulary.

    Every constructor argument is stored unchanged on the instance and is
    forwarded to ``Word2Vec`` under the same keyword name when ``fit`` runs.

    Note:
        ``pickle`` stores instances by reference to the module that defines
        this class, so the class must be importable from that same module
        in the process that loads a pickled instance.

    NOTE(review): the ``size``/``iter`` keywords and ``wv.vocab`` look like the
    gensim 3.x API -- confirm against the installed gensim version.
    """

    def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None,
                 sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5,
                 ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
                 trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False,
                 callbacks=(), max_final_vocab=None):
        """Store the Word2Vec hyperparameters; no model is trained here."""
        self.size = size
        self.alpha = alpha
        self.window = window
        self.min_count = min_count
        self.max_vocab_size = max_vocab_size
        self.sample = sample
        self.seed = seed
        self.workers = workers
        self.min_alpha = min_alpha
        self.sg = sg
        self.hs = hs
        self.negative = negative
        self.ns_exponent = ns_exponent
        self.cbow_mean = cbow_mean
        self.hashfxn = hashfxn
        self.iter = iter
        self.null_word = null_word
        self.trim_rule = trim_rule
        self.sorted_vocab = sorted_vocab
        self.batch_words = batch_words
        self.compute_loss = compute_loss
        self.callbacks = callbacks
        self.max_final_vocab = max_final_vocab

    def fit(self, X, y=None):
        """Train a Word2Vec model on ``X`` and keep it as ``self.model_``.

        Args:
            X: Passed to ``Word2Vec`` as ``sentences`` (presumably an iterable
                of token lists -- verify against the caller).
            y: Unused; accepted so the method matches the estimator interface.

        Returns:
            This instance, so calls can be chained.
        """
        self.model_ = Word2Vec(
            sentences=X, corpus_file=None,
            size=self.size, alpha=self.alpha, window=self.window, min_count=self.min_count,
            max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed,
            workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs,
            negative=self.negative, ns_exponent=self.ns_exponent, cbow_mean=self.cbow_mean,
            hashfxn=self.hashfxn, iter=self.iter, null_word=self.null_word,
            trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words,
            compute_loss=self.compute_loss, callbacks=self.callbacks,
            max_final_vocab=self.max_final_vocab)
        return self

    def transform(self, X):
        """Return an array with one embedding row per item of ``X``.

        Requires ``fit`` to have been called first, because the embeddings are
        read from ``self.model_``.
        """
        return np.array([self._get_embedding(words) for words in X])

    def _get_embedding(self, words):
        """Average the vectors of the in-vocabulary words of ``words``.

        Returns a float32 vector of length ``self.size``; it is all zeros when
        none of the words is in the vocabulary.
        """
        vocab = self.model_.wv.vocab
        vectors = [self.model_.wv[word] for word in words if word in vocab]
        if not vectors:
            # Same dtype as the averaged case, so the dtype of transform()'s
            # output does not depend on whether some item has no known words.
            return np.zeros(self.size, dtype=np.float32)
        return np.mean(np.asarray(vectors, dtype=np.float32), axis=0)
# Word2Vec
# NOTE(review): defining the class in this file only helps if this file is the
# module named in the pickle (`__main__` according to the reported error) --
# TODO confirm how the Flask app is launched.
model_EXT_WV_SV = joblib.load('utility/model/MachineLearning/SVM/model_Word2Vec_sv_cEXT.pkl')
GitHub code: https://github.com/Juned-Ansari/test
Pickle file: https://github.com/Juned-Ansari/test/blob/main/model_Word2Vec_sv_cEXT.pkl
Flask Web App: https://github.com/Juned-Ansari/test/tree/main/WebApp
From https://docs.python.org/3/library/pickle.html:
pickle can save and restore class instances transparently, however the class definition must be importable and live in the same module as when the object was stored.
The following types can be pickled:
- ...
- classes that are defined at the top level of a module
- instances of such classes ...
Considering your directory structure:
├── WebApp/
│ └── app.py
└── Untitled.ipynb
And assuming you run `flask run` from within `WebApp/`, so that `app` is a top-level module.
First, move the class `GensimWord2VecVectorizer` to the top level of `WebApp/app.py`.
Next, in your Jupyter Notebook, import `GensimWord2VecVectorizer` and trick `pickle` into thinking it comes from a top-level `app` module:
# Use the class definition that lives in the Flask app module.
from WebApp.app import GensimWord2VecVectorizer
# Make pickle record the class as belonging to a top-level module named 'app'.
GensimWord2VecVectorizer.__module__ = 'app'
import sys
# Register the already-imported WebApp.app module under the name 'app' as well,
# so that the module name 'app' resolves in this process.
sys.modules['app'] = sys.modules['WebApp.app']
Then you should be able to `dump` and `load` the pickle file.
If it's troublesome to import local modules, do this instead:
# Make pickle record the class as belonging to a top-level module named 'app'.
GensimWord2VecVectorizer.__module__ = 'app'
import sys
# type(sys) is the module type: create an empty module object named 'app' and
# register it in sys.modules without importing any file.
app = sys.modules['app'] = type(sys)('app')
# Expose the class on that module so the name app.GensimWord2VecVectorizer resolves.
app.GensimWord2VecVectorizer = GensimWord2VecVectorizer
Then you should be able to `dump` and `load` the pickle file.