Search code examples
python pandas numpy machine-learning sklearn-pandas

After running a model how do I save an Isolation Forest and a Local Outlier Factor as two different models?


I have been trying to write a machine learning program to detect credit card fraud using Isolation Forest and Local Outlier Factor methods from sklearn and pandas.

I have the code running and making predictions, but I can't figure out how to save each estimator as a separate model. I have been following some examples, but I don't know where and how to save them. I think it is something like .save('Isolation.h5') and .save('Outlier.h5'), but I'm not sure what to put in front of the .save.

If anyone could help me understand how to save each model that would be greatly appreciated.

My current code:

import numpy
import pandas
import matplotlib
import seaborn
import scipy

# import the necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the credit card transactions into a DataFrame
data = pd.read_csv(
    'C:/Users/super/OneDrive/Documents/School/Spring 2020/CS 657/Final Project/creditcard.csv'
)

# First look at the data: which columns are available?
print(data.columns)

# Continue with a reproducible 10% random sample of the rows
data = data.sample(frac=0.1, random_state=1)
print(data.shape)
print(data.describe())

# V1 - V28 are the results of a PCA Dimensionality reduction to protect user identities and sensitive features

# One histogram per column, drawn on a 20x20 figure
data.hist(figsize=(20, 20))
plt.show()

# Determine number of fraud cases in dataset

Fraud = data[data['Class'] == 1]
Valid = data[data['Class'] == 0]

# This value is later passed as `contamination`, which is the proportion of
# outliers in the whole data set - so divide by every row, not only by the
# valid ones (fraud / valid is a ratio, not a proportion).
outlier_fraction = len(Fraud) / float(len(data))
print(outlier_fraction)

# Reuse the subsets computed above instead of filtering the frame again.
print('Fraud Cases: {}'.format(len(Fraud)))
print('Valid Transactions: {}'.format(len(Valid)))

# Heatmap of the pairwise correlations between the columns
fig = plt.figure(figsize=(12, 9))
corrmat = data.corr()

sns.heatmap(corrmat, square=True, vmax=.8)
plt.show()

# Name of the column we'll be predicting
target = "Class"

# Every other column of the DataFrame is used as an input feature
columns = [name for name in data.columns.tolist() if name != target]

# Split the frame into the feature matrix and the label vector
X = data[columns]
Y = data[target]

# Report the dimensions of both
for part in (X, Y):
    print(part.shape)

from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

# Seed used wherever an estimator accepts a random_state
state = 1

# Build the two outlier detectors that will be compared
isolation_forest = IsolationForest(
    max_samples=len(X),
    contamination=outlier_fraction,
    random_state=state,
)
local_outlier_factor = LocalOutlierFactor(
    n_neighbors=20,
    contamination=outlier_fraction,
)

# Keyed by display name; the fitting loop relies on these exact names
classifiers = {
    "Isolation Forest": isolation_forest,
    "Local Outlier Factor": local_outlier_factor,
}

# Fit the model
# NOTE(review): nothing in this section draws on this figure or uses
# n_outliers; both are kept in case later code relies on them - confirm.
plt.figure(figsize=(9, 7))
n_outliers = len(Fraud)


# The position of each classifier is never used, so iterate the items directly.
for clf_name, clf in classifiers.items():

    # fit the data and tag outliers
    if clf_name == "Local Outlier Factor":
        # LOF is created with the default novelty=False, so it can only label
        # the data it is fitted on: fit and predict happen in a single call.
        y_pred = clf.fit_predict(X)
        scores_pred = clf.negative_outlier_factor_
    else:
        clf.fit(X)
        scores_pred = clf.decision_function(X)
        y_pred = clf.predict(X)

    # Map the estimator labels (-1 = outlier, 1 = inlier) to the dataset's
    # encoding (1 = fraud, 0 = valid) in one pass, rather than with two
    # in-place assignments whose result depends on their order.
    y_pred = np.where(y_pred == -1, 1, 0)

    # Number of rows whose predicted label differs from the true Class
    n_errors = (y_pred != Y).sum()

    # Run classification metrics
    print('{}: {}'.format(clf_name, n_errors))
    print(accuracy_score(Y, y_pred))
    print(classification_report(Y, y_pred))




Solution

  • Since you loop over all the classifiers and train them/make the predictions, you can simply save the model at the same time.

    For example, using pickle:

    import pickle
    
    def save_model(clf, filename):
        """Pickle ``clf`` and write it to ``filename``.

        Args:
            clf: The object to serialize (here, a fitted classifier).
            filename: Path of the file to write; it is opened in binary
                write mode.
        """
        with open(filename, mode='wb') as model_file:
            pickle.dump(clf, model_file)
    
    # The loop index is not needed, so iterate over the items directly.
    for clf_name, clf in classifiers.items():
    
        # fit the data and tag outliers
        if clf_name == "Local Outlier Factor":
            y_pred = clf.fit_predict(X)
            scores_pred = clf.negative_outlier_factor_
            # NOTE: a LocalOutlierFactor fitted with the default novelty=False
            # has no predict() for unseen data, so the reloaded object only
            # carries the results of this fit. Create it with novelty=True
            # (and call fit) if the saved model must score new samples.
            save_model(clf, 'Outlier.pkl')  # Saving the LOF
        else:
            clf.fit(X)
            scores_pred = clf.decision_function(X)
            y_pred = clf.predict(X)
            save_model(clf, 'Isolation.pkl')  # Saving the isolation forest
    
        # Placeholder for the rest of the original loop body (relabelling
        # and metrics), which stays unchanged.
        ...
    

    You can then load the models using:

    def load_model(filename):
        """Return the object unpickled from ``filename``.

        Args:
            filename: Path of a pickle file; it is opened in binary read mode.

        Returns:
            The object stored in the file.

        Only load files you trust: unpickling can execute arbitrary code.
        """
        with open(filename, mode='rb') as model_file:
            return pickle.load(model_file)
    

    You can save in another format as well; the idea is exactly the same regardless of the package used.