I am trying to do anomaly detection with an LSTM. I am able to plot all features with local and global anomalies, but I am not able to print all the anomaly values, datetime, loss, and threshold together (like a table).
After calculating the test and train MAE in the following way:
Y_train_pred = self.model.predict(self.X_train)
train_mae_loss = np.mean(np.abs(Y_train_pred - self.Y_train), axis=1)
Y_test_pred = self.model.predict(self.X_test)
test_mae_loss = np.mean(np.abs(Y_test_pred - self.Y_test), axis=1)
test = self.test[:len(Y_test_pred)]
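For context, loss_mean_vec and threshold_mean_exp used below are not defined in the snippet above; the sketch here shows one typical construction (simplified, so it may differ from my full code on GitHub): average the per-window MAE across features, and take a high percentile of the training loss as the threshold.
# Sketch only -- test_mae_loss has shape (n_windows, n_features) after the
# axis=1 mean above; average across features for one loss value per window:
loss_mean_vec = pd.Series(test_mae_loss.mean(axis=1))
# one common threshold choice: the 95th percentile of the training loss
threshold_mean_exp = np.percentile(train_mae_loss.mean(axis=1), 95)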
I tried to make a table by matching date, loss, threshold, and anomaly in this way:
test_score_df = pd.DataFrame(index=self.test.index)
print(test_score_df)
test_score_df['loss'] = loss_mean_vec
test_score_df['threshold'] = threshold_mean_exp
test_score_df['anomaly'] = test_score_df.loss > test_score_df.threshold
test_score_df['dckw'] = self.test[:].dckw
anomalies = test_score_df[test_score_df.anomaly == True]
print(anomalies.head())
But it throws an error:
AttributeError: 'DataFrame' object has no attribute 'dckw'
When I print self.test, it has all the features with the headers datetimeAt, dckw, ......
When I remove the line test_score_df['dckw'] = self.test[:].dckw, it gives me this result:
       loss  threshold  anomaly
0  0.106414   0.037134     True
1  0.107169   0.037134     True
2  0.107001   0.037134     True
3  0.105836   0.037134     True
4  0.103779   0.037134     True
So how can I get that last table together with the datetime and the other features from the CSV file, so that I can plot against datetime and see when the anomalies appeared?
My code and files are quite heavy, so I uploaded them to GitHub: https://github.com/Laudarisd/test.git
print(self.test) gives me this output:
                             dckw   ackw  dayTotal  wind_spd  temp    pres
datetimeAt
2021-12-08 19:00:00  1.880145e-39  0.000      70.0       0.5   3.5  1027.6
2021-12-08 19:00:00  1.875275e-39  0.000      70.8       0.5   3.5  1027.6
2021-12-08 19:00:00  1.879741e-39  0.000      68.9       0.5   3.5  1027.6
2021-12-08 19:00:00  1.881514e-39  0.000      69.8       0.5   3.5  1027.6
2021-12-08 20:00:00  1.881775e-39  0.000      69.9       1.0   3.1  1027.6
The code looks like this:
from os import path
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, RepeatVector, TimeDistributed, Activation
import matplotlib.pyplot as plt
import seaborn as sns
TIME_STEPS = 30
ALPHA = 0.9
DATA_POINT_TO_PREDICT = 3
def Data():
    dataset = pd.read_csv('./combined.csv')
    dataset = dataset.fillna(0)
    #dates = dataset['datetimeAt']
    dataset = dataset.drop(columns=['invno', 'ts'])
    dataset = dataset.set_index('datetimeAt')
    return dataset

#print(Data())

class AutoEncoder:
    def __init__(self):
        self.data = Data()
        print(self.data.shape)

    def create_dataset(self, X, y, time_steps=1):
        Xs, ys = [], []
        for i in range(len(X) - time_steps):
            v = X.iloc[i:(i + time_steps)].values
            Xs.append(v)
            u = y.iloc[i:(i + time_steps)].values
            ys.append(u)
        return np.array(Xs), np.array(ys)

    def split_train_test(self, test_size=0.2):
        df = self.data
        train_size = int(len(df) * (1 - test_size))
        self.train, self.test = df.iloc[0:train_size], df.iloc[train_size:len(df)]
        #print(self.test)
        #index = self.test
        #print(index)

    def split_X_Y(self, data_point_to_predict=0):
        self.X_train, self.Y_train = self.create_dataset(self.train, self.train, TIME_STEPS)
        self.X_test, self.Y_test = self.create_dataset(self.test, self.test, TIME_STEPS)
        if (data_point_to_predict > 0):
            #print(self.X_train)
            self.X_train = self.X_train[slice(None, self.X_train.shape[0] - data_point_to_predict)]
            #print(self.X_train)
            self.X_test = self.X_test[slice(None, self.X_test.shape[0] - data_point_to_predict)]
            #print(self.Y_train)
            self.Y_train = self.Y_train[slice(data_point_to_predict, None)]
            #print(self.Y_train)
            self.Y_test = self.Y_test[slice(data_point_to_predict, None)]

    def normalize(self):
        scaler = MinMaxScaler().fit(self.train)
        self.train = pd.DataFrame(scaler.transform(self.train))
        self.test = pd.DataFrame(scaler.transform(self.test))
The error is due to the fact that this step

def normalize(self):
    scaler = MinMaxScaler().fit(self.train)
    self.train = pd.DataFrame(scaler.transform(self.train))
    self.test = pd.DataFrame(scaler.transform(self.test))

removes both the index and the column names from self.train and self.test: scaler.transform returns a plain NumPy array, and wrapping it in pd.DataFrame without arguments assigns a fresh integer index and integer column labels. To resolve this issue you need to update the code as follows:
self.train = pd.DataFrame(
    data=scaler.transform(self.train),
    columns=self.train.columns,
    index=self.train.index
)
self.test = pd.DataFrame(
    data=scaler.transform(self.test),
    columns=self.test.columns,
    index=self.test.index
)
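If you prefer, the same fix can be factored into a small helper so both frames are guaranteed to be rebuilt the same way; this is just a sketch (the helper name is mine, not from the original code):

def scale_keep_metadata(df, scaler):
    # scaler.transform returns a bare NumPy array, so re-attach the
    # original index and column names when rebuilding the DataFrame
    return pd.DataFrame(
        data=scaler.transform(df),
        columns=df.columns,
        index=df.index,
    )

scaler = MinMaxScaler().fit(self.train)
self.train = scale_keep_metadata(self.train, scaler)
self.test = scale_keep_metadata(self.test, scaler)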
After that you also need to update the definition of the anomalies data frame as follows:
test_score_df = pd.DataFrame(index=self.test.index)
test_score_df['loss'] = np.append(np.zeros(DATA_POINT_TO_PREDICT + TIME_STEPS), loss_mean_vec.values)
test_score_df['threshold'] = threshold_mean_exp
test_score_df['anomaly'] = test_score_df.loss > test_score_df.threshold
test_score_df['dckw'] = self.test[:].dckw
test_score_df.index = pd.DatetimeIndex(test_score_df.index)
anomalies = test_score_df[test_score_df.anomaly == True]
print(anomalies.head())
#                          loss  threshold  anomaly  dckw
# datetimeAt
# 2021-12-09 01:00:00  0.111500   0.037009     True   0.0
# 2021-12-09 02:00:00  0.113632   0.037009     True   0.0
# 2021-12-09 02:00:00  0.115057   0.037009     True   0.0
# 2021-12-09 02:00:00  0.115312   0.037009     True   0.0
# 2021-12-09 02:00:00  0.114501   0.037009     True   0.0
The zeros at the start are needed given that you don't have the loss for the first DATA_POINT_TO_PREDICT + TIME_STEPS data points in the test set.
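To double-check that offset (a standalone sketch with a made-up test-set length, not part of the code above): create_dataset yields len(X) - TIME_STEPS windows, and split_X_Y then drops another DATA_POINT_TO_PREDICT of them, so the loss vector is exactly TIME_STEPS + DATA_POINT_TO_PREDICT entries shorter than self.test.

n_test_rows = 500  # made-up length of self.test, for illustration only
n_windows = n_test_rows - TIME_STEPS          # windows produced by create_dataset
n_losses = n_windows - DATA_POINT_TO_PREDICT  # remaining after the shift in split_X_Y
print(n_test_rows - n_losses)                 # 33 == TIME_STEPS + DATA_POINT_TO_PREDICT

Once this is done you can plot the results: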
plt.plot(test_score_df.index, test_score_df['dckw'].values, color='black')
plt.scatter(anomalies.index, anomalies['dckw'].values, color='red')
plt.show()
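As an optional follow-up (not required for the fix): once the index is a DatetimeIndex, the x-axis tick labels can overlap, and matplotlib's autofmt_xdate can slant them before showing the figure:

plt.plot(test_score_df.index, test_score_df['dckw'].values, color='black')
plt.scatter(anomalies.index, anomalies['dckw'].values, color='red')
plt.gcf().autofmt_xdate()  # rotate and right-align the datetime tick labels
plt.show()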