
How to create a predictive model using Mahalanobis distance outlier detection in Python


I found an example of multivariate outlier detection (Mahalanobis distance) online and adapted it to take Linear Discriminant Analysis (LDA) coordinates as input. Here are two sets of example LDA coordinates (a sketch of how such coordinates can be generated follows the tables):

LDA coord:
(EX:2)
           0         1   2
0  -3.132160  0.032012  C0
1  -1.924197  1.092878  C0
2   0.506485  2.169236  C0
3  -1.841936  2.970699  C0
4   1.663835  0.902320  C0
5   1.347670  2.507184  C0
6   0.906082 -0.616425  C0
7  -0.424781  3.194779  C0
8   0.710616  1.747661  C0
9   0.319855  3.083899  C0
10 -3.249343 -1.302349  C1
11 -3.894662 -0.098001  C1
12 -3.748814 -0.508311  C1
13 -2.323617 -0.883107  C1
14 -3.795406 -1.998027  C1
15 -3.553759  2.075790  C1
16 -2.258489  0.340000  C1
17 -3.396124  1.484340  C1
18 -2.711332 -1.081713  C1
19 -3.405522 -1.233175  C1
20 -3.145327 -0.487384  C2
21 -2.124185 -0.326975  C2
22 -1.754098 -0.123257  C2
23 -1.938832 -0.398431  C2
24 -3.454642 -0.131798  C2
25 -2.405942  0.425208  C2
26 -2.625229 -0.101265  C2
27 -1.981016 -0.048747  C2
28 -2.245377 -0.578808  C2
29 -2.745318 -1.076790  C2
30 -1.621120 -1.191785  C3
31 -1.794487 -0.791839  C3
32  0.487554 -0.456122  C3
33 -0.700520 -0.704317  C3
34 -1.626866 -1.566346  C3
35 -2.943879 -0.182724  C3
36 -1.049298  0.083431  C3
37 -1.169427 -0.688795  C3
38 -0.789618 -0.720257  C3
39 -1.002526 -1.489786  C3
40  7.725554  0.069060  C4
41  7.161180  0.766171  C4
42  5.567454 -0.540341  C4
43  7.199526  0.598618  C4
44  7.060474  0.326986  C4
45  8.107402 -0.733354  C4
46  8.136742 -3.016891  C4
47  5.209265 -0.586597  C4
48  7.800402 -0.147399  C4
49  6.867754 -0.059155  C4
 
(EX:3)
           0         1   2
0  -1.949488  2.979766  C0
1  -1.406498  2.181719  C0
2   0.054572  1.097451  C0
3  -0.935271  5.312452  C0
4   1.598947  3.535428  C0
5  -0.731745 -0.173804  C0
6   1.302720  2.027883  C0
7  -0.654739  1.961010  C0
8   0.248038  1.141469  C0
9  -1.693209 -0.649851  C0
10 -3.199713 -2.103417  C0
11 -2.755474 -0.456333  C1
12 -2.474350 -0.734787  C1
13 -2.225912 -0.887536  C1
14 -2.565908 -0.972269  C1
15 -2.508166 -0.524934  C1
16 -1.973947 -0.109873  C1
17 -2.679097 -0.172011  C1
18 -2.049151 -0.847307  C1
19 -2.513344 -0.138586  C1
20 -2.557531 -1.723377  C1
21 -1.389194 -0.284962  C2
22 -0.776032  0.221122  C2
23 -1.387943 -0.026513  C2
24 -1.510273  0.816104  C2
25 -1.674479 -0.207244  C2
26 -2.121766 -0.157644  C2
27 -1.904922 -0.492034  C2
28 -1.449933 -0.338953  C2
29 -2.153559 -0.146933  C2
30 -0.720952 -1.334875  C2
31 -1.437556 -0.511429  C3
32  1.096347  0.205410  C3
33 -0.275734 -0.427320  C3
34 -0.953779 -1.263496  C3
35 -1.889583 -0.929957  C3
36 -1.521789 -0.355877  C3
37 -1.967298 -1.285574  C3
38 -0.482127 -0.381481  C3
39 -1.073449 -0.345657  C3
40  6.792683 -1.586820  C4
41  6.136840 -0.290805  C4
42  4.890663  0.785053  C4
43  6.499080 -0.979607  C4
44  6.753821 -1.491083  C4
45  1.589010  0.369480  C4
46  5.351898  0.888101  C4
47  5.568847 -0.401511  C4
48  5.335149  0.515722  C4
49  6.345298 -1.304311  C4
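
For reference, here is a minimal sketch of how coordinates like these can be produced with scikit-learn. The feature matrix X and label vector y are assumptions for illustration, not part of the original data:

import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Hypothetical inputs: X is an (n_samples, n_features) feature matrix and y holds
# the class labels 'C0'..'C4'; LDA projects X onto 2 discriminant components.
lda = LinearDiscriminantAnalysis(n_components=2)
coords = lda.fit_transform(X, y)
lda_df = pd.DataFrame(coords, columns=['0', '1'])
lda_df['2'] = y   # class-label column, as in the tables above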

Here are my functions and code:

# Imports needed by the code below:
import numpy as np
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.covariance import MinCovDet, EmpiricalCovariance
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from mlxtend.plotting import plot_confusion_matrix


def is_pos_def(A):
    # A matrix is positive definite iff it is symmetric and has a Cholesky factorization.
    if np.allclose(A, A.T):
        try:
            np.linalg.cholesky(A)
            return True
        except np.linalg.LinAlgError:
            return False
    else:
        return False

def cov_matrix(data, verbose=False):
    covariance_matrix = np.cov(data, rowvar=False)
    # # Alternatively, fit an MCD robust estimator to the data:
    # covariance_matrix = MinCovDet().fit(data).covariance_
    # # Or an MLE estimator:
    # covariance_matrix = EmpiricalCovariance().fit(data).covariance_

    # Raise instead of printing, so the caller's tuple unpacking never receives None:
    if not is_pos_def(covariance_matrix):
        raise ValueError("Covariance matrix is not positive definite!")
    inv_covariance_matrix = np.linalg.inv(covariance_matrix)
    if not is_pos_def(inv_covariance_matrix):
        raise ValueError("Inverse of covariance matrix is not positive definite!")
    return covariance_matrix, inv_covariance_matrix

def MahalanobisDist(inv_cov_matrix, mean_distr, data, verbose=False):
    diff = data - mean_distr
    md = []
    # Vectorized equivalent of the per-row form sqrt(d @ inv_cov @ d):
    md.append(np.sqrt(np.diag(np.linalg.multi_dot([diff, inv_cov_matrix, diff.T]))))
    return md  # list holding one array of distances; callers index with [0]

def MD_detectOutliers(dist, extreme=False, verbose=False):
    k = 3. if extreme else 2.
    threshold = np.mean(dist) * k
    outliers = []
    for i in range(len(dist)):
        if dist[i] >= threshold:
            outliers.append(i)  # index of the outlier
    return np.array(outliers)

def MD_threshold(dist, extreme=False, verbose=False):
    k = 3. if extreme else 2.
    threshold = np.mean(dist) * k

    # # Tukey's method:
    # Q1, Q3 = np.quantile(dist, 0.25), np.quantile(dist, 0.75)
    # threshold = Q3 + 1.5 * (Q3 - Q1)

    # # Assuming chi-square-distributed squared distances
    # # (degrees of freedom = number of variables, here the 2 LDA components):
    # threshold = np.sqrt(st.chi2.ppf(1 - (k / 100), df=2))

    # # Assuming normally distributed distances:
    # k = 0.997 if extreme else 0.95
    # threshold = st.norm.ppf(k, loc=np.mean(dist), scale=np.sqrt(np.var(dist)))

    return threshold
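
# Note (my addition, not from the original post): for p-dimensional Gaussian data,
# the squared Mahalanobis distance follows a chi-square distribution with p degrees
# of freedom, so a principled cutoff for the 2 LDA components at the 97.5% level is:
chi2_cutoff = np.sqrt(st.chi2.ppf(0.975, df=2))   # ~2.72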


class MahalanobisOneclassClassifier():
    def __init__(self, X_train, threshold):
        self.X_train = X_train
        self.threshold = threshold
        print('Critical value is: ', self.threshold)

    def predict_proba(self, X_test):
        # X_test is expected to already contain the Mahalanobis distances.
        mahalanobis_dist = X_test
        return mahalanobis_dist

    def predict(self, X_test):
        dist = self.predict_proba(X_test)
        dist = dist.to_numpy().flatten()
        # 1 = anomaly (distance at or above the threshold), 0 = normal:
        predict_lst = [int(dist_val >= self.threshold) for dist_val in dist]
        return predict_lst
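
# Quick sanity check of the classifier (my addition, not from the original post):
# distances below the threshold map to 0, at or above the threshold map to 1.
_demo = MahalanobisOneclassClassifier(pd.DataFrame(), threshold=2.0)
assert _demo.predict(pd.DataFrame({'Mob_dist': [0.5, 2.5]})) == [0, 1]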

######## Anomaly Detection:
## Using Mahalanobis distance metric:
# Inputting the damage cases:
# Drop the class-label column '2' so only the two LDA coordinates remain:
Case_0_LDA_dataframe = Case_0_LDA_dataframe.drop(['2'], axis=1)
Case_1_LDA_dataframe = Case_1_LDA_dataframe.drop(['2'], axis=1)
Case_2_LDA_dataframe = Case_2_LDA_dataframe.drop(['2'], axis=1)
Case_3_LDA_dataframe = Case_3_LDA_dataframe.drop(['2'], axis=1)
Case_4_LDA_dataframe = Case_4_LDA_dataframe.drop(['2'], axis=1)
    
data_train = np.array(Case_0_LDA_dataframe.values)
data_test_C1 = np.array(Case_1_LDA_dataframe.values)
data_test_C2 = np.array(Case_2_LDA_dataframe.values)
data_test_C3 = np.array(Case_3_LDA_dataframe.values)
data_test_C4 = np.array(Case_4_LDA_dataframe.values)

data_train_df = pd.DataFrame(Case_0_LDA_dataframe.values)
data_test_df_C1 =  pd.DataFrame(Case_1_LDA_dataframe.values)
data_test_df_C2 =  pd.DataFrame(Case_2_LDA_dataframe.values)
data_test_df_C3 =  pd.DataFrame(Case_3_LDA_dataframe.values)
data_test_df_C4 =  pd.DataFrame(Case_4_LDA_dataframe.values)

# Calculating the covariance matrix:
covar_matrix, inv_covar_matrix = cov_matrix(data=data_train)

# Calculating the mean value for the input variables:
mean_distr = data_train_df.mean(axis=0)
# rob_cov = MinCovDet(random_state=0).fit(data_train_df)
# robust_mean = rob_cov.location_  #robust mean
# mean_distr = robust_mean

# Calculating the Mahalanobis distance and threshold value to flag datapoints as an anomaly:
dist_test_C1 = MahalanobisDist(inv_covar_matrix, mean_distr, data_test_df_C1, verbose=True)
dist_test_C2 = MahalanobisDist(inv_covar_matrix, mean_distr, data_test_df_C2, verbose=True)
dist_test_C3 = MahalanobisDist(inv_covar_matrix, mean_distr, data_test_df_C3, verbose=True)
dist_test_C4 = MahalanobisDist(inv_covar_matrix, mean_distr, data_test_df_C4, verbose=True)
dist_train = MahalanobisDist(inv_covar_matrix, mean_distr, data_train_df, verbose=True)
threshold = MD_threshold(dist_train, extreme = False)
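
# Cross-check (my addition, not from the original post): SciPy's implementation
# should reproduce the first training distance computed by MahalanobisDist.
from scipy.spatial.distance import mahalanobis
assert np.isclose(mahalanobis(data_train_df.iloc[0], mean_distr, inv_covar_matrix),
                  dist_train[0][0])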

# Distribution of the Mahalanobis distances used to set the anomaly threshold:
plt.figure()
sns.distplot(np.square(dist_train), bins=10, kde=False)   # note: sns.distplot is deprecated; newer seaborn uses sns.histplot
# plt.xlim([0.0, 15])
plt.show()

plt.figure()
sns.distplot(dist_train, bins=10, kde=True, color='green')
# plt.xlim([0.0, 5])
plt.xlabel('Mahalanobis dist')
plt.show()

# Build one anomaly frame per case: Mahalanobis distance, threshold, and anomaly flag.
case_dists = {'C0': dist_train, 'C1': dist_test_C1, 'C2': dist_test_C2,
              'C3': dist_test_C3, 'C4': dist_test_C4}
anomaly_frames = []
for case, dist in case_dists.items():
    frame = pd.DataFrame({'Mob_dist': dist[0]})
    frame['Thresh'] = threshold
    # If Mob_dist is above the threshold, flag the point as an anomaly:
    frame['Anomaly'] = frame['Mob_dist'] > frame['Thresh']
    frame['Case'] = case
    anomaly_frames.append(frame)

final_scored_md = pd.concat(anomaly_frames, ignore_index=True)
print(final_scored_md)

# Plotting the observation vs Mahalanobis distance:
final_scored_len = final_scored_md.shape
obser = np.arange(1, final_scored_len[0]+1)
dfc = final_scored_md.query('Mob_dist > Thresh')
# obser_dfc = np.arange(dfc.shape)

fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('Observation', fontsize = 15)
ax.set_ylabel('Mahalanobis distance', fontsize = 15)
ax.set_title('Mahalanobis distance plot of Example {0}'.format(sensor_no), fontsize = 20)
colors = {'C0': 'blue', 'C1': 'yellow', 'C2': 'green', 'C3': 'cyan', 'C4': 'purple'}
for name, group in final_scored_md.groupby('Case'):
    ax.scatter(group.index, group['Mob_dist'], s=50, c=colors[name], label=name)
ax.axhline(y=threshold, color='k', linestyle='--')
ax.scatter(dfc.index, dfc['Mob_dist'], c = 'red', s = 50, label='Anomaly')
ax.legend()
ax.grid()
fig.tight_layout()
plt.show()


######## Classification part:
# Separating MSD Damage cases:
Case_0_MSD = final_scored_md.loc[final_scored_md['Case'] == 'C0']
Case_1_MSD = final_scored_md.loc[final_scored_md['Case'] == 'C1']
Case_2_MSD = final_scored_md.loc[final_scored_md['Case'] == 'C2']
Case_3_MSD = final_scored_md.loc[final_scored_md['Case'] == 'C3']
Case_4_MSD = final_scored_md.loc[final_scored_md['Case'] == 'C4']

## Training dataframe:
# Case_0_MSD_train = Case_0_MSD.iloc[:7]
# Case_1_MSD_train = Case_1_MSD.iloc[:7]
# Case_2_MSD_train = Case_2_MSD.iloc[:7]
# Case_3_MSD_train = Case_3_MSD.iloc[:7]
# Case_4_MSD_train = Case_4_MSD.iloc[:7]
Case_0_MSD_train = Case_0_MSD.sample(n=7)   # consider random_state=... for reproducible splits
Case_1_MSD_train = Case_1_MSD.sample(n=7)
Case_2_MSD_train = Case_2_MSD.sample(n=7)
Case_3_MSD_train = Case_3_MSD.sample(n=7)
Case_4_MSD_train = Case_4_MSD.sample(n=7)
frames_train = [Case_0_MSD_train, Case_1_MSD_train, Case_2_MSD_train, Case_3_MSD_train, Case_4_MSD_train]
dataframe_train = pd.concat(frames_train)

## Testing dataframe:
# Case_0_MSD_test = Case_0_MSD.iloc[-3:]
# Case_1_MSD_test = Case_1_MSD.iloc[-3:]
# Case_2_MSD_test = Case_2_MSD.iloc[-3:]
# Case_3_MSD_test = Case_3_MSD.iloc[-3:]
# Case_4_MSD_test = Case_4_MSD.iloc[-3:]
Case_0_MSD_test = Case_0_MSD.sample(n=3)   # note: sampled independently of the train split, so rows may overlap
Case_1_MSD_test = Case_1_MSD.sample(n=3)
Case_2_MSD_test = Case_2_MSD.sample(n=3)
Case_3_MSD_test = Case_3_MSD.sample(n=3)
Case_4_MSD_test = Case_4_MSD.sample(n=3)
frames_test = [Case_0_MSD_test, Case_1_MSD_test, Case_2_MSD_test, Case_3_MSD_test, Case_4_MSD_test]
dataframe_test = pd.concat(frames_test)
   
# Training set: keep the MSD column as the feature; extract the Anomaly flag as the label:
dataframe_train = dataframe_train.drop(['Case', 'Thresh'], axis=1)
true_y_class_train = dataframe_train['Anomaly'].astype(int).to_numpy()
dataframe_train = dataframe_train.drop(['Anomaly'], axis=1)

# Test set: same treatment:
dataframe_test = dataframe_test.drop(['Case', 'Thresh'], axis=1)
true_y_class_test = dataframe_test['Anomaly'].astype(int).to_numpy()
dataframe_test = dataframe_test.drop(['Anomaly'], axis=1)

clf = MahalanobisOneclassClassifier(dataframe_train, threshold)
mahalanobis_dist = clf.predict_proba(dataframe_test)
pred_mahalanobis_dist_class = clf.predict(dataframe_test)
print(mahalanobis_dist)
print(pred_mahalanobis_dist_class)

# Pred and Truth
test_acc = accuracy_score(true_y_class_test, pred_mahalanobis_dist_class) * 100
print('The test set accuracy is %4.2f%%' % test_acc)

# Obtaining the report of the model:
print('Report of MSD: ')
print(classification_report(y_true=true_y_class_test, y_pred=pred_mahalanobis_dist_class))

targets = ['0', '1']

cnf_matrix = confusion_matrix(y_true=true_y_class_test, y_pred=pred_mahalanobis_dist_class)

print('Confusion Matrix of MSD: ')
print(cnf_matrix)

# Obtaining number of labels:
labels = list(set(true_y_class_test))
labels.sort()
print("Total labels: %s -> %s" % (len(labels), labels))

# Obtaining the dataframe of the confusion matrix:
df_conf = pd.DataFrame(data=confusion_matrix(true_y_class_test, pred_mahalanobis_dist_class, labels=labels), columns=labels,index=labels)
print('Confusion Matrix Dataframe:')
print(df_conf)

# Local (metrics per class) #
tps = {}
fps = {}
fns = {}
precision_local = {}
recall_local = {}
f1_local = {}
accuracy_local = {}
for label in labels:
    tps[label] = df_conf.loc[label, label]
    fps[label] = df_conf[label].sum() - tps[label]
    fns[label] = df_conf.loc[label].sum() - tps[label]
    tp, fp, fn = tps[label], fps[label], fns[label]

    precision_local[label] = tp / (tp + fp) if (tp + fp) > 0. else 0.
    recall_local[label] = tp / (tp + fn) if (tp + fn) > 0. else 0.
    p, r = precision_local[label], recall_local[label]

    f1_local[label] = 2. * p * r / (p + r) if (p + r) > 0. else 0.
    accuracy_local[label] = tp / (tp + fp + fn) if (tp + fp + fn) > 0. else 0.
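
# Cross-check (my addition, not from the original post): sklearn should reproduce
# the per-class precision/recall/F1 computed in the loop above.
from sklearn.metrics import precision_recall_fscore_support
p_chk, r_chk, f1_chk, _ = precision_recall_fscore_support(
    true_y_class_test, pred_mahalanobis_dist_class, labels=labels, zero_division=0)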

print('\n')
print("#-- Local measures --#")
print("True Positives:", tps)
print("False Positives:", fps)
print("False Negatives:", fns)
print("Precision:", precision_local)
print("Recall:", recall_local)
print("F1-Score:", f1_local)
print("Accuracy:", accuracy_local)

# Global metrics #
micro_averages = {}
macro_averages = {}

correct_predictions = sum(tps.values())
den = sum(list(tps.values()) + list(fps.values()))
micro_averages["Precision"] = 1. * correct_predictions / den if den > 0. else 0.

den = sum(list(tps.values()) + list(fns.values()))
micro_averages["Recall"] = 1. * correct_predictions / den if den > 0. else 0.

micro_avg_p, micro_avg_r = micro_averages["Precision"], micro_averages["Recall"]
micro_averages["F1-score"] = 2. * micro_avg_p * micro_avg_r / (micro_avg_p + micro_avg_r) if (micro_avg_p + micro_avg_r) > 0. else 0.

macro_averages["Precision"] = np.mean(list(precision_local.values()))
macro_averages["Recall"] = np.mean(list(recall_local.values()))

macro_avg_p, macro_avg_r = macro_averages["Precision"], macro_averages["Recall"]
macro_averages["F1-Score"] = np.mean(list(f1_local.values()))

total_predictions = df_conf.values.sum()
accuracy_global = correct_predictions / total_predictions if total_predictions > 0. else 0.

print('\n')
print("#-- Global measures --#")
print("Micro-Averages:", micro_averages)
print("Macro-Averages:", macro_averages)
print("Correct predictions:", correct_predictions)
print("Total predictions:", total_predictions)
print("Accuracy:", accuracy_global * 100)

# TN (True Negative) #
tns = {}
for label in set(true_y_class_test):
    tns[label] = len(true_y_class_test) - (tps[label] + fps[label] + fns[label])
print("True Negatives:", tns)

accuracy_local_new = {}
for label in labels:
    tp, fp, fn, tn = tps[label], fps[label], fns[label], tns[label]
    accuracy_local_new[label] = (tp + tn) / (tp + fp + fn + tn) if (tp + fp + fn + tn) > 0. else 0.

total_true = sum(list(tps.values()) + list(tns.values()))
total_predictions = sum(list(tps.values()) + list(tns.values()) + list(fps.values()) + list(fns.values()))
accuracy_global_new = 1. * total_true / total_predictions if total_predictions > 0. else 0.

print("Accuracy (per class), with TNs:", accuracy_local_new)
print("Accuracy (per class), without TNs:", accuracy_local)
print("Accuracy (global), with TNs:", accuracy_global_new)
print("Accuracy (global), without TNs:", accuracy_global)

print('\n')

fig_1, ax_1 = plot_confusion_matrix(conf_mat=cnf_matrix, colorbar=True, show_absolute=True, show_normed=False, class_names=targets)
plt.title('Confusion matrix of MSD Model of Example {0}'.format(sensor_no))

fig_2, ax_2 = plot_confusion_matrix(conf_mat=cnf_matrix, colorbar=True, show_absolute=False, show_normed=True, class_names=targets)
plt.title('Normalized MSD confusion matrix of Example {0}'.format(sensor_no))
# plt.show()
plt.show(block=False)
plt.pause(1)
plt.close('all')

So the code runs using the functions above, and the results are shown in the images below:

[Image: Mahalanobis distance plot]

[Image: MSD confusion matrix]

So, I would like to create a Mahalanobis distance predictive model based on the functions in the code above, and use the confusion matrix and classification report from sklearn.metrics to evaluate the anomaly detection. I was also wondering what should go into the fit and predict functions. I tried the following code, which contains the MSD classifier class:

class MahalanobisOneclassClassifier():
    def __init__(self, X_train, threshold):
        self.X_train = X_train
        self.threshold = threshold
        print('Critical value is: ', self.threshold)

    def predict_proba(self, X_test):
        mahalanobis_dist = X_test
        return mahalanobis_dist

    def predict(self, X_test):
        dist = self.predict_proba(X_test)
        dist = dist.to_numpy().flatten()
        predict_lst = [int(dist_val >= self.threshold) for dist_val in dist]
        return predict_lst

However, I keep getting 100% prediction accuracy every time, whereas, as the image and the LDA coordinates show, the other cases are far from Case 0. I would like the points in the cases other than Case 0 that fall below the threshold to be counted as false negatives.


Solution

  • I worked on the model for some time and managed to narrow it down to the following code:

    class MahalanobisOneclassClassifier():
        def __init__(self, X_train, threshold):
            self.X_train = X_train
            self.threshold = threshold
            print('Critical value is: ', self.threshold)
    
        def predict_proba(self, X_test):
            mahalanobis_dist = X_test
            return mahalanobis_dist
    
        def predict(self, X_test):
            # To convert the data into numpy format: 
            dist = self.predict_proba(X_test)
            dist = dist.transpose()
            dist = dist.to_numpy()
    
            # To implement the prediction of the MD model:
            predict_lst = [int(dist_val >= self.threshold) for dist_val in dist[0]]
            return predict_lst
    

    However, to be able to count the points in the cases other than Case 0 that fall below the threshold as false negatives, you need the true class labels, so that the predictions can be compared against the ground truth. This is shown in the following code:

    true_y_class_test = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    
    clf = MahalanobisOneclassClassifier(dataframe_train, threshold)
    mahalanobis_dist = clf.predict_proba(dataframe_test)
    pred_mahalanobis_dist_class = clf.predict(dataframe_test)
    print(mahalanobis_dist)
    print(pred_mahalanobis_dist_class)
    
    # Pred and Truth
    test_acc = accuracy_score(true_y_class_test, pred_mahalanobis_dist_class) * 100
    print('The test set accuracy is %4.2f%%' % test_acc)
    
    # Obtaining the report of the model:
    print('Report of MSD: ')
    print(classification_report(y_true=true_y_class_test, y_pred=pred_mahalanobis_dist_class))
    
    targets = ['0', '1']
    
    cnf_matrix = confusion_matrix(y_true=true_y_class_test, y_pred=pred_mahalanobis_dist_class)
    
    print('Confusion Matrix of MSD: ')
    print(cnf_matrix)
    

    It is important to note that the list of true class labels depends on your case study.
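
    Finally, since the question asks what should go into the fit and predict functions: below is a minimal sketch (my own, not from the original post) of a variant that computes the Mahalanobis distance internally, so that predict can take raw LDA coordinates instead of precomputed distances. The class name and the k multiplier default are assumptions:

    class MahalanobisDistanceClassifier:
        def __init__(self, k=2.0):
            self.k = k   # threshold multiplier, as in MD_threshold above

        def fit(self, X_train):
            # Learn the mean and inverse covariance of the normal (Case 0) data.
            X = np.asarray(X_train, dtype=float)
            self.mean_ = X.mean(axis=0)
            self.inv_cov_ = np.linalg.inv(np.cov(X, rowvar=False))
            # Threshold = k * mean training distance, matching MD_threshold above.
            self.threshold_ = self.k * self.mahalanobis(X).mean()
            return self

        def mahalanobis(self, X):
            # Per-row quadratic form sqrt(d @ inv_cov @ d) via einsum.
            diff = np.asarray(X, dtype=float) - self.mean_
            return np.sqrt(np.einsum('ij,jk,ik->i', diff, self.inv_cov_, diff))

        def predict(self, X):
            # 1 = anomaly (distance at or above the threshold), 0 = normal.
            return (self.mahalanobis(X) >= self.threshold_).astype(int)

    Usage would mirror the code above, e.g. clf = MahalanobisDistanceClassifier().fit(data_train) followed by clf.predict(data_test_C1).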