I have been working on multivariate outlier detection (Mahalanobis distance), starting from code I found online and using Linear Discriminant Analysis (LDA) coordinates as the input. Here are example LDA coordinates:
LDA coord:
(EX:2)
0 1 2
0 -3.132160 0.032012 C0
1 -1.924197 1.092878 C0
2 0.506485 2.169236 C0
3 -1.841936 2.970699 C0
4 1.663835 0.902320 C0
5 1.347670 2.507184 C0
6 0.906082 -0.616425 C0
7 -0.424781 3.194779 C0
8 0.710616 1.747661 C0
9 0.319855 3.083899 C0
10 -3.249343 -1.302349 C1
11 -3.894662 -0.098001 C1
12 -3.748814 -0.508311 C1
13 -2.323617 -0.883107 C1
14 -3.795406 -1.998027 C1
15 -3.553759 2.075790 C1
16 -2.258489 0.340000 C1
17 -3.396124 1.484340 C1
18 -2.711332 -1.081713 C1
19 -3.405522 -1.233175 C1
20 -3.145327 -0.487384 C2
21 -2.124185 -0.326975 C2
22 -1.754098 -0.123257 C2
23 -1.938832 -0.398431 C2
24 -3.454642 -0.131798 C2
25 -2.405942 0.425208 C2
26 -2.625229 -0.101265 C2
27 -1.981016 -0.048747 C2
28 -2.245377 -0.578808 C2
29 -2.745318 -1.076790 C2
30 -1.621120 -1.191785 C3
31 -1.794487 -0.791839 C3
32 0.487554 -0.456122 C3
33 -0.700520 -0.704317 C3
34 -1.626866 -1.566346 C3
35 -2.943879 -0.182724 C3
36 -1.049298 0.083431 C3
37 -1.169427 -0.688795 C3
38 -0.789618 -0.720257 C3
39 -1.002526 -1.489786 C3
40 7.725554 0.069060 C4
41 7.161180 0.766171 C4
42 5.567454 -0.540341 C4
43 7.199526 0.598618 C4
44 7.060474 0.326986 C4
45 8.107402 -0.733354 C4
46 8.136742 -3.016891 C4
47 5.209265 -0.586597 C4
48 7.800402 -0.147399 C4
49 6.867754 -0.059155 C4
(EX:3)
0 1 2
0 -1.949488 2.979766 C0
1 -1.406498 2.181719 C0
2 0.054572 1.097451 C0
3 -0.935271 5.312452 C0
4 1.598947 3.535428 C0
5 -0.731745 -0.173804 C0
6 1.302720 2.027883 C0
7 -0.654739 1.961010 C0
8 0.248038 1.141469 C0
9 -1.693209 -0.649851 C0
10 -3.199713 -2.103417 C0
11 -2.755474 -0.456333 C1
12 -2.474350 -0.734787 C1
13 -2.225912 -0.887536 C1
14 -2.565908 -0.972269 C1
15 -2.508166 -0.524934 C1
16 -1.973947 -0.109873 C1
17 -2.679097 -0.172011 C1
18 -2.049151 -0.847307 C1
19 -2.513344 -0.138586 C1
20 -2.557531 -1.723377 C1
21 -1.389194 -0.284962 C2
22 -0.776032 0.221122 C2
23 -1.387943 -0.026513 C2
24 -1.510273 0.816104 C2
25 -1.674479 -0.207244 C2
26 -2.121766 -0.157644 C2
27 -1.904922 -0.492034 C2
28 -1.449933 -0.338953 C2
29 -2.153559 -0.146933 C2
30 -0.720952 -1.334875 C2
31 -1.437556 -0.511429 C3
32 1.096347 0.205410 C3
33 -0.275734 -0.427320 C3
34 -0.953779 -1.263496 C3
35 -1.889583 -0.929957 C3
36 -1.521789 -0.355877 C3
37 -1.967298 -1.285574 C3
38 -0.482127 -0.381481 C3
39 -1.073449 -0.345657 C3
40 6.792683 -1.586820 C4
41 6.136840 -0.290805 C4
42 4.890663 0.785053 C4
43 6.499080 -0.979607 C4
44 6.753821 -1.491083 C4
45 1.589010 0.369480 C4
46 5.351898 0.888101 C4
47 5.568847 -0.401511 C4
48 5.335149 0.515722 C4
49 6.345298 -1.304311 C4
Here are my functions and code:
def is_pos_def(A):
    """Return True when ``A`` is a symmetric positive-definite matrix.

    Parameters
    ----------
    A : array-like
        Candidate matrix. Anything ``np.asarray`` can convert is accepted.

    Returns
    -------
    bool
        True if ``A`` is square, symmetric (within ``np.allclose``
        tolerance) and its Cholesky factorization succeeds; False otherwise.
    """
    A = np.asarray(A)
    # Only a square 2-D matrix can be positive definite. Checking this first
    # also keeps the transpose comparison below from raising on shapes that
    # cannot be broadcast against each other.
    if A.ndim != 2 or A.shape[0] != A.shape[1]:
        return False
    if not np.allclose(A, A.T):
        return False
    try:
        # Cholesky factorization fails exactly when the matrix is not
        # positive definite.
        np.linalg.cholesky(A)
    except np.linalg.LinAlgError:
        return False
    return True
def cov_matrix(data, verbose=False):
    """Estimate the covariance matrix of ``data`` and its inverse.

    The covariance is computed with ``np.cov(data, rowvar=False)``, so each
    column of ``data`` is a variable and each row an observation. Both the
    covariance matrix and its inverse must pass ``is_pos_def``.

    Parameters
    ----------
    data : array-like
        Observations in rows, variables in columns.
    verbose : bool, optional
        Accepted for backward compatibility; currently unused.

    Returns
    -------
    tuple
        ``(covariance_matrix, inv_covariance_matrix)``.

    Raises
    ------
    ValueError
        If the covariance matrix or its inverse is not positive definite.
        This case used to print the message and fall through, returning
        ``None``, which made the caller's tuple unpacking fail with an
        unrelated ``TypeError``.
    """
    # Alternatives tried earlier instead of the plain sample covariance:
    # a MinCovDet (robust MCD) estimator and an EmpiricalCovariance (MLE)
    # estimator, each fitted on ``data`` and read through ``.covariance_``.
    covariance_matrix = np.cov(data, rowvar=False)
    if not is_pos_def(covariance_matrix):
        raise ValueError("Error: Covariance Matrix is not positive definite!")

    inv_covariance_matrix = np.linalg.inv(covariance_matrix)
    if not is_pos_def(inv_covariance_matrix):
        raise ValueError("Error: Inverse of Covariance Matrix is not positive definite!")

    return covariance_matrix, inv_covariance_matrix
def MahalanobisDist(inv_cov_matrix, mean_distr, data, verbose=False):
    """Compute the Mahalanobis distance of every observation in ``data``.

    For each row ``x`` the distance is
    ``sqrt((x - mean)^T @ inv_cov @ (x - mean))``.

    Parameters
    ----------
    inv_cov_matrix : array-like, shape (p, p)
        Inverse covariance matrix of the reference distribution.
    mean_distr : array-like, shape (p,)
        Mean of the reference distribution.
    data : array-like, shape (n, p) or (p,)
        Observations to score. A single 1-D observation is treated as one row.
    verbose : bool, optional
        Accepted for backward compatibility; currently unused.

    Returns
    -------
    list
        A one-element list whose only item is the ``(n,)`` array of
        distances. The list wrapper is kept because existing callers index
        the result with ``[0]``.
    """
    # Subtract before converting to an array so that pandas inputs keep their
    # usual label alignment between ``data`` and ``mean_distr``.
    diff = np.atleast_2d(np.asarray(data - mean_distr))
    inv_cov = np.asarray(inv_cov_matrix)
    # Row-wise quadratic form. This avoids materialising the full n x n
    # matrix diff @ inv_cov @ diff.T just to read its diagonal.
    squared = np.sum((diff @ inv_cov) * diff, axis=1)
    return [np.sqrt(squared)]
def MD_detectOutliers(dist, extreme=False, verbose=False):
    """Return the indices of distances at or above ``k`` times their mean.

    Parameters
    ----------
    dist : array-like
        Distances to screen. Nested input, such as the one-element list
        returned by ``MahalanobisDist``, is flattened first, so the returned
        indices refer to positions in the flattened sequence.
    extreme : bool, optional
        Use the multiplier ``k = 3`` instead of the default ``k = 2``.
    verbose : bool, optional
        Accepted for backward compatibility; currently unused.

    Returns
    -------
    numpy.ndarray
        Integer indices of the outliers (empty when there are none).
    """
    # Flatten so that a list wrapping a single array is compared element by
    # element rather than array by array.
    distances = np.asarray(dist).ravel()
    k = 3. if extreme else 2.
    threshold = np.mean(distances) * k
    return np.flatnonzero(distances >= threshold)
def MD_threshold(dist, extreme=False, verbose=False):
    """Return the anomaly cut-off: the mean of ``dist`` times a multiplier.

    The multiplier is 3 when ``extreme`` is true and 2 otherwise.
    ``verbose`` is accepted but not used.

    Other cut-off rules were experimented with here and are not active:
    Tukey's fence (Q3 + 1.5 * IQR), a chi-square quantile, and quantiles of
    a normal distribution fitted to the distances.
    """
    if extreme:
        multiplier = 3.
    else:
        multiplier = 2.
    return np.mean(dist) * multiplier
class MahalanobisOneclassClassifier:
    """Threshold classifier over precomputed Mahalanobis distances.

    The class does not compute distances itself: ``predict_proba`` returns
    its input unchanged, so callers must pass distances as ``X_test``.
    """

    def __init__(self, X_train, threshold):
        """Store the training data and the threshold, and print the threshold."""
        self.X_train = X_train
        self.threshold = threshold
        print('Critical value is: ', self.threshold)

    def predict_proba(self, X_test):
        """Return ``X_test`` unchanged (it is expected to hold distances)."""
        return X_test

    def predict(self, X_test):
        """Label every distance: 1 if it is ``>= self.threshold``, else 0.

        Parameters
        ----------
        X_test : array-like
            Distances as a DataFrame, Series, ndarray or nested list.
            Multi-dimensional input is flattened before thresholding.

        Returns
        -------
        list of int
            One 0/1 label per flattened distance.
        """
        # np.asarray accepts pandas objects as well as plain arrays and
        # lists, so the method is no longer limited to inputs that provide
        # ``.to_numpy()``.
        dist = np.asarray(self.predict_proba(X_test)).flatten()
        return [int(dist_val >= self.threshold) for dist_val in dist]
######## Anomaly Detection:
## Using Mahalanobis distance metric:
# Inputting the damage cases:
# Drop the column labelled '2' from every case frame so that only the
# remaining columns are used as features (presumably '2' holds the case label
# and the rest are the LDA coordinates -- TODO confirm against how the
# Case_*_LDA_dataframe objects are built).
Case_0_LDA_dataframe = Case_0_LDA_dataframe.drop(['2'], axis=1) # df.columns is zero-based pd.Index
Case_1_LDA_dataframe = Case_1_LDA_dataframe.drop(['2'], axis=1) # df.columns is zero-based pd.Index
Case_2_LDA_dataframe = Case_2_LDA_dataframe.drop(['2'], axis=1) # df.columns is zero-based pd.Index
Case_3_LDA_dataframe = Case_3_LDA_dataframe.drop(['2'], axis=1) # df.columns is zero-based pd.Index
Case_4_LDA_dataframe = Case_4_LDA_dataframe.drop(['2'], axis=1) # df.columns is zero-based pd.Index
# Case 0 is the reference (training) set; cases 1-4 are scored against it.
# Plain ndarray copies of the feature values:
data_train = np.array(Case_0_LDA_dataframe.values)
data_test_C1 = np.array(Case_1_LDA_dataframe.values)
data_test_C2 = np.array(Case_2_LDA_dataframe.values)
data_test_C3 = np.array(Case_3_LDA_dataframe.values)
data_test_C4 = np.array(Case_4_LDA_dataframe.values)
# DataFrame copies rebuilt from the raw values, so their row and column
# labels are fresh default (0-based) indices:
data_train_df = pd.DataFrame(Case_0_LDA_dataframe.values)
data_test_df_C1 = pd.DataFrame(Case_1_LDA_dataframe.values)
data_test_df_C2 = pd.DataFrame(Case_2_LDA_dataframe.values)
data_test_df_C3 = pd.DataFrame(Case_3_LDA_dataframe.values)
data_test_df_C4 = pd.DataFrame(Case_4_LDA_dataframe.values)
# Calculating the covariance matrix (and its inverse) from the training case only:
covar_matrix, inv_covar_matrix = cov_matrix(data=data_train)
# Calculating the mean value for the input variables (column-wise mean of the training case):
mean_distr = data_train_df.mean(axis=0)
# Alternative kept for reference: robust location from a MinCovDet fit.
# rob_cov = MinCovDet(random_state=0).fit(data_train_df)
# robust_mean = rob_cov.location_ #robust mean
# mean_distr = robust_mean
# Calculating the Mahalanobis distance and threshold value to flag datapoints as an anomaly:
# Every case is measured against the training mean and inverse covariance.
# MahalanobisDist returns a one-element list, so the distance array is at [0].
dist_test_C1 = MahalanobisDist(inv_covar_matrix, mean_distr, data_test_df_C1, verbose=True)
dist_test_C2 = MahalanobisDist(inv_covar_matrix, mean_distr, data_test_df_C2, verbose=True)
dist_test_C3 = MahalanobisDist(inv_covar_matrix, mean_distr, data_test_df_C3, verbose=True)
dist_test_C4 = MahalanobisDist(inv_covar_matrix, mean_distr, data_test_df_C4, verbose=True)
dist_train = MahalanobisDist(inv_covar_matrix, mean_distr, data_train_df, verbose=True)
# The threshold is derived from the training distances only
# (2 x their mean, because extreme=False).
threshold = MD_threshold(dist_train, extreme = False)
# Distribution of Threshold value for flagging an anomaly:
# First figure: histogram of the squared training distances.
plt.figure()
sns.distplot(np.square(dist_train),bins = 10, kde= False)
# plt.xlim([0.0,15])
plt.show()
# Second figure: histogram plus KDE of the training distances themselves.
plt.figure()
sns.distplot(dist_train, bins = 10, kde= True, color = 'green');
# plt.xlim([0.0,5])
plt.xlabel('Mahalanobis dist')
plt.show()
def _score_case(frame_index, distances, thresh, case_label):
    """Build the per-observation anomaly table for one case.

    The returned frame is indexed by ``frame_index`` and has the columns
    'Mob_dist' (the distances), 'Thresh' (the shared threshold), 'Anomaly'
    (True where the distance is strictly above the threshold) and 'Case'
    (the case label).
    """
    scored = pd.DataFrame(index=frame_index)
    scored['Mob_dist'] = distances
    scored['Thresh'] = thresh
    # If Mob_dist above threshold: Flag as anomaly
    scored['Anomaly'] = scored['Mob_dist'] > scored['Thresh']
    scored['Case'] = case_label
    return scored


# One table per case, all flagged against the same threshold. The distance
# lists are one-element lists, hence the [0]. Each table is created directly
# with its final index, so the former re-assignment of ``.index`` and the
# discarded ``.head()`` calls are no longer needed.
anomaly_train = _score_case(data_train_df.index, dist_train[0], threshold, 'C0')
anomaly_C1 = _score_case(data_test_df_C1.index, dist_test_C1[0], threshold, 'C1')
anomaly_C2 = _score_case(data_test_df_C2.index, dist_test_C2[0], threshold, 'C2')
anomaly_C3 = _score_case(data_test_df_C3.index, dist_test_C3[0], threshold, 'C3')
anomaly_C4 = _score_case(data_test_df_C4.index, dist_test_C4[0], threshold, 'C4')

# Stack all cases. ignore_index=True renumbers the rows consecutively, so
# every observation gets a unique position along the plot's x axis.
final_scored_md = pd.concat([anomaly_train, anomaly_C1, anomaly_C2, anomaly_C3, anomaly_C4], ignore_index=True)
print(final_scored_md)
# Plotting the observation vs Mahalanobis distance:
final_scored_len = final_scored_md.shape
obser = np.arange(final_scored_len[0]) + 1
targets = ['C0', 'C1', 'C2', 'C3', 'C4']
colors = ['blue', 'yellow', 'green', 'cyan', 'purple']

# Rows above the threshold; they are overplotted in red further down.
dfc = final_scored_md.query('Mob_dist > Thresh')
final_scored_md_gp = final_scored_md.groupby("Case")

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)

# One scatter series per case, labelled with the case name for the legend.
for case_name, case_rows in final_scored_md_gp:
    ax.scatter(case_rows.index, case_rows['Mob_dist'], s=50, label=case_name)

# Dashed horizontal line at the anomaly threshold.
ax.axhline(y=threshold, color='k', linestyle='--')
ax.scatter(dfc.index, dfc['Mob_dist'], c='red', s=50, label='Anomaly')

ax.set_title('Mahalanobis distance plot of Example {0}'.format(sensor_no), fontsize=20)
ax.set_xlabel('Observation', fontsize=15)
ax.set_ylabel('Mahalanobis distance', fontsize=15)
ax.legend()
ax.grid()
fig.tight_layout()
plt.show()
######## Classification part:
def _split_case(case_frame, n_train=7, n_test=3):
    """Draw disjoint random training and testing rows from one case.

    ``n_train`` rows are sampled first. The test rows are then sampled from
    the rows that are left, so no observation appears in both parts. If
    fewer than ``n_test`` rows remain, all remaining rows are used.
    """
    train_rows = case_frame.sample(n=n_train)
    remaining = case_frame.drop(train_rows.index)
    test_rows = remaining.sample(n=min(n_test, len(remaining)))
    return train_rows, test_rows


def _distances_and_labels(scored_frame):
    """Split a scored frame into its distance column and 0/1 anomaly labels."""
    labels = scored_frame['Anomaly'].astype(int).to_numpy()
    distances = scored_frame.drop(['Case', 'Thresh', 'Anomaly'], axis=1)
    return distances, labels


# Separating MSD Damage cases:
Case_0_MSD = final_scored_md.loc[final_scored_md['Case'] == 'C0']
Case_1_MSD = final_scored_md.loc[final_scored_md['Case'] == 'C1']
Case_2_MSD = final_scored_md.loc[final_scored_md['Case'] == 'C2']
Case_3_MSD = final_scored_md.loc[final_scored_md['Case'] == 'C3']
Case_4_MSD = final_scored_md.loc[final_scored_md['Case'] == 'C4']

## Training and testing dataframes:
# Previously the two parts were sampled independently from the same case
# (sample(n=7) and sample(n=3)), so a row could land in both. A deterministic
# alternative is the first 7 / last 3 rows of each case (iloc[:7] / iloc[-3:]).
Case_0_MSD_train, Case_0_MSD_test = _split_case(Case_0_MSD)
Case_1_MSD_train, Case_1_MSD_test = _split_case(Case_1_MSD)
Case_2_MSD_train, Case_2_MSD_test = _split_case(Case_2_MSD)
Case_3_MSD_train, Case_3_MSD_test = _split_case(Case_3_MSD)
Case_4_MSD_train, Case_4_MSD_test = _split_case(Case_4_MSD)

frames_train = [Case_0_MSD_train, Case_1_MSD_train, Case_2_MSD_train, Case_3_MSD_train, Case_4_MSD_train]
dataframe_train = pd.concat(frames_train)
frames_test = [Case_0_MSD_test, Case_1_MSD_test, Case_2_MSD_test, Case_3_MSD_test, Case_4_MSD_test]
dataframe_test = pd.concat(frames_test)

# Keeping only the MSD column as the feature; the 'Anomaly' flag becomes the
# 0/1 label vector.
# NOTE: these labels are the threshold flags stored in 'Anomaly'
# (Mob_dist > Thresh), not the known damage case. A classifier that applies
# the same threshold to the same distances reproduces them by construction.
dataframe_train, true_y_class_train = _distances_and_labels(dataframe_train)
dataframe_test, true_y_class_test = _distances_and_labels(dataframe_test)
# Build the threshold classifier. dataframe_train is only stored on the
# instance; the decision rule depends solely on `threshold`.
clf = MahalanobisOneclassClassifier(dataframe_train, threshold)
# predict_proba returns its input unchanged, so this is the test frame itself.
mahalanobis_dist = clf.predict_proba(dataframe_test)
# 0/1 labels: 1 where the distance is >= threshold.
pred_mahalanobis_dist_class = clf.predict(dataframe_test)
print(mahalanobis_dist)
print(pred_mahalanobis_dist_class)
# Pred and Truth
# Accuracy expressed as a percentage.
test_acc = accuracy_score(true_y_class_test, pred_mahalanobis_dist_class) * 100
print('The test set accuracy is %4.2f%%' % test_acc)
# Obtaining the report of the model:
print('Report of MSD: ')
print(classification_report(y_true=true_y_class_test, y_pred=pred_mahalanobis_dist_class))
# Display names for the two classes (0 and 1), used later when plotting.
targets = ['0', '1']
cnf_matrix = confusion_matrix(y_true=true_y_class_test, y_pred=pred_mahalanobis_dist_class)
print('Confusion Matrix of MSD: ')
print(cnf_matrix)
# Obtaining number of labels:
# Only labels that occur in the ground truth are listed, in sorted order.
labels = list(set(true_y_class_test))
labels.sort()
print("Total labels: %s -> %s" % (len(labels), labels))
# Obtaining the dataframe of the confusion matrix:
# Rows and columns are both labelled with `labels`, so cells can be addressed
# by class label in the per-class metrics below.
df_conf = pd.DataFrame(data=confusion_matrix(true_y_class_test, pred_mahalanobis_dist_class, labels=labels), columns=labels,index=labels)
print('Confusion Matrix Dataframe:')
print(df_conf)
# Local (metrics per class) #
# df_conf comes from sklearn's confusion_matrix, where rows are the true
# classes and columns the predicted classes. For a given label the diagonal
# cell is therefore TP, the rest of its column is FP and the rest of its row
# is FN.
tps = {}
fps = {}
fns = {}
precision_local = {}
recall_local = {}
f1_local = {}
accuracy_local = {}
for label in labels:
    tps[label] = df_conf.loc[label, label]
    fps[label] = df_conf[label].sum() - tps[label]
    fns[label] = df_conf.loc[label].sum() - tps[label]
    tp, fp, fn = tps[label], fps[label], fns[label]
    precision_local[label] = tp / (tp + fp) if (tp + fp) > 0. else 0.
    # Guard on recall's own denominator (tp + fn). The previous guard tested
    # (tp + fp), which did not protect this division.
    recall_local[label] = tp / (tp + fn) if (tp + fn) > 0. else 0.
    p, r = precision_local[label], recall_local[label]
    f1_local[label] = 2. * p * r / (p + r) if (p + r) > 0. else 0.
    # "Accuracy" without true negatives: tp / (tp + fp + fn).
    accuracy_local[label] = tp / (tp + fp + fn) if (tp + fp + fn) > 0. else 0.
print('\n')
print("#-- Local measures --#")
print("True Positives:", tps)
print("False Positives:", fps)
print("False Negatives:", fns)
print("Precision:", precision_local)
print("Recall:", recall_local)
print("F1-Score:", f1_local)
print("Accuracy:", accuracy_local)
# Global metrics #
correct_predictions = sum(tps.values())

# Micro precision: all true positives over all positive predictions.
den = sum(tps.values()) + sum(fps.values())
if den > 0.:
    micro_avg_p = 1. * correct_predictions / den
else:
    micro_avg_p = 0.

# Micro recall: all true positives over all actual positives.
den = sum(tps.values()) + sum(fns.values())
if den > 0.:
    micro_avg_r = 1. * correct_predictions / den
else:
    micro_avg_r = 0.

micro_averages = {"Precision": micro_avg_p, "Recall": micro_avg_r}
if (micro_avg_p + micro_avg_r) > 0.:
    micro_averages["F1-score"] = 2. * micro_avg_p * micro_avg_r / (micro_avg_p + micro_avg_r)
else:
    micro_averages["F1-score"] = 0.

# Macro averages: unweighted means of the per-class values.
macro_avg_p = np.mean(list(precision_local.values()))
macro_avg_r = np.mean(list(recall_local.values()))
macro_averages = {
    "Precision": macro_avg_p,
    "Recall": macro_avg_r,
    "F1-Score": np.mean(list(f1_local.values())),
}

# Global accuracy: diagonal of the confusion matrix over all of its cells.
total_predictions = df_conf.values.sum()
if total_predictions > 0.:
    accuracy_global = correct_predictions / total_predictions
else:
    accuracy_global = 0.

print('\n')
print("#-- Global measures --#")
print("Micro-Averages:", micro_averages)
print("Macro-Averages:", macro_averages)
print("Correct predictions:", correct_predictions)
print("Total predictions:", total_predictions)
print("Accuracy:", accuracy_global * 100)
# TN (True Negative) #
# Every sample that is neither a TP, FP nor FN for a label is a TN for it.
tns = {
    label: len(true_y_class_test) - (tps[label] + fps[label] + fns[label])
    for label in set(true_y_class_test)
}
print("True Negatives:", tns)

# Per-class accuracy that also counts the true negatives.
accuracy_local_new = {}
for label in labels:
    tp, fp, fn, tn = (counts[label] for counts in (tps, fps, fns, tns))
    if (tp + fp + fn + tn) > 0.:
        accuracy_local_new[label] = (tp + tn) / (tp + fp + fn + tn)
    else:
        accuracy_local_new[label] = 0.

# Global accuracy with TNs, pooled over all classes.
total_true = sum(tps.values()) + sum(tns.values())
total_predictions = total_true + sum(fps.values()) + sum(fns.values())
if total_predictions > 0.:
    accuracy_global_new = 1. * total_true / total_predictions
else:
    accuracy_global_new = 0.

print("Accuracy (per class), with TNs:", accuracy_local_new)
print("Accuracy (per class), without TNs:", accuracy_local)
print("Accuracy (global), with TNs:", accuracy_global_new)
print("Accuracy (global), without TNs:", accuracy_global)
print('\n')
# Two renderings of the same confusion matrix: absolute counts first, then
# normalized values. `targets` supplies the class names on the axes.
# NOTE(review): plot_confusion_matrix is not defined in this snippet; the
# keyword names suggest mlxtend.plotting -- confirm the import.
fig_1, ax_1 = plot_confusion_matrix(conf_mat=cnf_matrix, colorbar=True, show_absolute=True, show_normed=False, class_names=targets)
plt.title('Confusion matrix of MSD Model of Example {0}'.format(sensor_no))
fig_2, ax_2 = plot_confusion_matrix(conf_mat=cnf_matrix, colorbar=True, show_absolute=False, show_normed=True, class_names=targets)
plt.title('Normalized MSD confusion matrix of Example {0}'.format(sensor_no))
# plt.show()
# Show the figures without blocking, keep them up for one second, then close
# every open figure.
plt.show(block=False)
plt.pause(1)
plt.close('all')
The code above runs using these functions, and the results are shown in the following pictures:
I would like to create a Mahalanobis distance predictive model based on the functions in the code above, and to use the confusion matrix and classification report from sklearn.metrics to evaluate its anomaly detection. I was also wondering what should go into the fit and predict functions. I tried the following code, which contains the MSD classifier class:
class MahalanobisOneclassClassifier:
    """Threshold classifier over precomputed Mahalanobis distances.

    The class does not compute distances itself: ``predict_proba`` returns
    its input unchanged, so callers must pass distances as ``X_test``.
    """

    def __init__(self, X_train, threshold):
        """Store the training data and the threshold, and print the threshold."""
        self.X_train = X_train
        self.threshold = threshold
        print('Critical value is: ', self.threshold)

    def predict_proba(self, X_test):
        """Return ``X_test`` unchanged (it is expected to hold distances)."""
        return X_test

    def predict(self, X_test):
        """Label every distance: 1 if it is ``>= self.threshold``, else 0.

        Parameters
        ----------
        X_test : array-like
            Distances as a DataFrame, Series, ndarray or nested list.
            Multi-dimensional input is flattened before thresholding.

        Returns
        -------
        list of int
            One 0/1 label per flattened distance.
        """
        # np.asarray accepts pandas objects as well as plain arrays and
        # lists, so the method is no longer limited to inputs that provide
        # ``.to_numpy()``.
        dist = np.asarray(self.predict_proba(X_test)).flatten()
        return [int(dist_val >= self.threshold) for dist_val in dist]
However, I keep getting 100% prediction accuracy every time, whereas in the image and in the LDA coordinates the other cases are far from Case 0. So, I would like the points from the other cases (other than Case 0) that fall below the threshold to be counted as false negatives.
I worked on the model for some time and managed to narrow it down to the following code:
class MahalanobisOneclassClassifier():
    """Threshold classifier over precomputed Mahalanobis distances.

    ``predict_proba`` returns its input unchanged, so ``X_test`` must already
    hold distances. ``predict`` thresholds the first column only.
    """

    def __init__(self, X_train, threshold):
        """Keep the training data and threshold, and print the threshold."""
        self.threshold = threshold
        self.X_train = X_train
        print('Critical value is: ', self.threshold)

    def predict_proba(self, X_test):
        """Hand back ``X_test`` as given."""
        return X_test

    def predict(self, X_test):
        """Return a 0/1 label per row: 1 where the distance is >= threshold."""
        # After transposing, row 0 of the array is the first column of the
        # input frame.
        scores = self.predict_proba(X_test).transpose().to_numpy()
        predicted = []
        for score in scores[0]:
            predicted.append(int(score >= self.threshold))
        return predicted
However, to be able to count the points from the other cases (other than Case 0) that fall below the threshold as false negatives, you need the true class labels so that the predictions can be compared with the truth. This is illustrated in the following code:
# Ground-truth labels written by hand (0 = normal, 1 = anomalous) instead of
# being derived from the threshold.
# NOTE(review): the list needs one entry per row of dataframe_test, in the
# same row order -- verify against how the test frame is assembled.
true_y_class_test = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
clf = MahalanobisOneclassClassifier(dataframe_train, threshold)
# predict_proba returns the test frame unchanged; predict thresholds it.
mahalanobis_dist = clf.predict_proba(dataframe_test)
pred_mahalanobis_dist_class = clf.predict(dataframe_test)
print(mahalanobis_dist)
print(pred_mahalanobis_dist_class)
# Pred and Truth
# Accuracy expressed as a percentage.
test_acc = accuracy_score(true_y_class_test, pred_mahalanobis_dist_class) * 100
print('The test set accuracy is %4.2f%%' % test_acc)
# Obtaining the report of the model:
print('Report of MSD: ')
print(classification_report(y_true=true_y_class_test, y_pred=pred_mahalanobis_dist_class))
# Display names for the two classes (0 and 1).
targets = ['0', '1']
cnf_matrix = confusion_matrix(y_true=true_y_class_test, y_pred=pred_mahalanobis_dist_class)
print('Confusion Matrix of MSD: ')
print(cnf_matrix)
It is important to note that the list of true class labels depends on your case study.