Tags: python, tensorflow, lstm, anomaly-detection

Need help in LSTM Autoencoder - Anomaly detection


I am trying to do anomaly detection with an LSTM autoencoder. I am able to plot all features with local and global anomalies, but I am not able to print the anomaly values, datetime, loss, and threshold together (like a table).

After calculating test and train MAE in the following way:

Y_train_pred = self.model.predict(self.X_train)
train_mae_loss = np.mean(np.abs(Y_train_pred - self.Y_train), axis=1)
Y_test_pred = self.model.predict(self.X_test)
test_mae_loss = np.mean(np.abs(Y_test_pred - self.Y_test), axis=1)
test = self.test[:len(Y_test_pred)]

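(Note: loss_mean_vec and threshold_mean_exp are not defined in the snippet above. A common way to obtain them from the MAE arrays, keeping the same names, would be roughly the following sketch:)

# Not from the original code: average the per-feature MAE so each test sample
# gets a single loss value, and take the maximum training error as the threshold.
loss_mean_vec = pd.Series(test_mae_loss.mean(axis=1))
threshold_mean_exp = float(np.max(train_mae_loss.mean(axis=1)))
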
I tried to make a table by matching date, loss, threshold, and anomaly in this way:

test_score_df = pd.DataFrame(index=self.test.index)
print(test_score_df)
test_score_df['loss'] = loss_mean_vec
test_score_df['threshold'] = threshold_mean_exp
test_score_df['anomaly'] = test_score_df.loss > test_score_df.threshold
test_score_df['dckw'] = self.test[:].dckw
anomalies = test_score_df[test_score_df.anomaly == True]
print(anomalies.head())

But it throws an error:

AttributeError: 'DataFrame' object has no attribute 'dckw' 

When I print self.test, it has all the features with the headers datetimeAt, dckw ......

When I remove the line test_score_df['dckw'] = self.test[:].dckw,

it gives me this result:

       loss  threshold  anomaly
0  0.106414   0.037134     True
1  0.107169   0.037134     True
2  0.107001   0.037134     True
3  0.105836   0.037134     True
4  0.103779   0.037134     True

So how can I get the final table with the datetime and the other features from the CSV file, so that I can plot against datetime and see when the anomalies appeared?

My code and files are quite heavy, so I uploaded them to GitHub: https://github.com/Laudarisd/test.git

print(self.test) gives me this output:

                           dckw   ackw  dayTotal  wind_spd  temp    pres
datetimeAt                                                                
2021-12-08 19:00:00  1.880145e-39  0.000      70.0       0.5   3.5  1027.6
2021-12-08 19:00:00  1.875275e-39  0.000      70.8       0.5   3.5  1027.6
2021-12-08 19:00:00  1.879741e-39  0.000      68.9       0.5   3.5  1027.6
2021-12-08 19:00:00  1.881514e-39  0.000      69.8       0.5   3.5  1027.6
2021-12-08 20:00:00  1.881775e-39  0.000      69.9       1.0   3.1  1027.6

Code looks like this:


from os import path
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, RepeatVector, TimeDistributed, Activation
import matplotlib.pyplot as plt
import seaborn as sns

TIME_STEPS = 30
ALPHA = 0.9
DATA_POINT_TO_PREDICT = 3

def Data():
    dataset = pd.read_csv('./combined.csv')
    dataset = dataset.fillna(0)
    #dates = dataset['datetimeAt']
    dataset = dataset.drop(columns = ['invno', 'ts'])
    dataset = dataset.set_index('datetimeAt')
    return dataset

#print(Data())

class AutoEncoder:
    def __init__(self):
        self.data = Data()
        print(self.data.shape)
    def create_dataset(self, X, y, time_steps=1):
        Xs, ys = [], []
        for i in range(len(X) - time_steps):
            v = X.iloc[i:(i + time_steps)].values
            Xs.append(v)
            u = y.iloc[i:(i + time_steps)].values
            ys.append(u)
        return np.array(Xs), np.array(ys)

    def split_train_test(self, test_size=0.2):
        df = self.data
        train_size = int(len(df) * (1 - test_size))
        self.train, self.test = df.iloc[0:train_size], df.iloc[train_size:len(df)]
        #print(self.test)
        #index=self.test
        #print(index)
    def split_X_Y(self, data_point_to_predict=0):
        self.X_train, self.Y_train = self.create_dataset(self.train, self.train, TIME_STEPS)
        self.X_test, self.Y_test = self.create_dataset(self.test, self.test, TIME_STEPS)
        if (data_point_to_predict > 0):
            #print(self.X_train)
            self.X_train = self.X_train[slice(None, self.X_train.shape[0] - data_point_to_predict)]
            #print(self.X_train)
            self.X_test = self.X_test[slice(None, self.X_test.shape[0] - data_point_to_predict)]
            #print(self.Y_train)
            self.Y_train = self.Y_train[slice(data_point_to_predict, None)]
            #print(self.Y_train)
            self.Y_test = self.Y_test[slice(data_point_to_predict, None)]

    def normalize(self):
        scaler = MinMaxScaler().fit(self.train)
        self.train = pd.DataFrame(scaler.transform(self.train))
        self.test = pd.DataFrame(scaler.transform(self.test))

Solution

  • The error occurs because this step

    def normalize(self):
        scaler = MinMaxScaler().fit(self.train)
        self.train = pd.DataFrame(scaler.transform(self.train))
        self.test = pd.DataFrame(scaler.transform(self.test))
    

    removes both the index and the column names from self.train and self.test. To resolve this issue you need to update the code as follows:

    self.train = pd.DataFrame(
        data=scaler.transform(self.train),
        columns=self.train.columns,
        index=self.train.index
    )
    
    self.test = pd.DataFrame(
        data=scaler.transform(self.test),
        columns=self.test.columns,
        index=self.test.index
    )
    
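    If you are on scikit-learn 1.2 or later (assuming that version is available), an alternative is to ask the scaler to return DataFrames directly, which preserves the index and column names without rebuilding them by hand:

    # Alternative (requires scikit-learn >= 1.2): let the scaler output DataFrames
    # so the index and column names of self.train / self.test are preserved.
    scaler = MinMaxScaler().set_output(transform="pandas")
    scaler.fit(self.train)
    self.train = scaler.transform(self.train)
    self.test = scaler.transform(self.test)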

    After that you also need to update the definition of the anomalies data frame as follows:

    test_score_df = pd.DataFrame(index=self.test.index)
    test_score_df['loss'] = np.append(np.zeros(DATA_POINT_TO_PREDICT + TIME_STEPS), loss_mean_vec.values)
    test_score_df['threshold'] = threshold_mean_exp
    test_score_df['anomaly'] = test_score_df.loss > test_score_df.threshold
    test_score_df['dckw'] = self.test[:].dckw
    test_score_df.index = pd.DatetimeIndex(test_score_df.index)
    anomalies = test_score_df[test_score_df.anomaly == True]
    
    print(anomalies.head())
    #                          loss  threshold  anomaly  dckw
    # datetimeAt                                             
    # 2021-12-09 01:00:00  0.111500   0.037009     True   0.0
    # 2021-12-09 02:00:00  0.113632   0.037009     True   0.0
    # 2021-12-09 02:00:00  0.115057   0.037009     True   0.0
    # 2021-12-09 02:00:00  0.115312   0.037009     True   0.0
    # 2021-12-09 02:00:00  0.114501   0.037009     True   0.0
    

    The zeros are prepended because you don't have a loss value for the first DATA_POINT_TO_PREDICT + TIME_STEPS rows of the test set: the sliding windows and the prediction offset consume them. Once this is done you can plot the results:

    plt.plot(test_score_df.index, test_score_df['dckw'].values, color='black')
    plt.scatter(anomalies.index, anomalies['dckw'].values, color='red')
    plt.show()
    

    (Plot: the dckw series over time in black, with the detected anomalies marked as red points.)
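
    A quick way to double-check the alignment that the zero padding assumes (a hypothetical check, using the names above):

    # The padded loss vector must be exactly as long as the test index,
    # otherwise the 'loss' column assignment above would fail with a length mismatch.
    assert TIME_STEPS + DATA_POINT_TO_PREDICT + len(loss_mean_vec) == len(self.test.index)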