Tags: python, time-series, lstm, prediction, encoder-decoder

How does my LSTM model know about the testing data, and why does it simply repeat previous values/patterns?


I have an Encoder-Decoder LSTM model that learns to predict 12 months of data in advance while looking back 12 months. If it helps at all, my dataset spans about 10 years in total (120 months). I keep 8 years for training/validation and 2 years for testing. My understanding is that my model does not have access to the testing data at training time.

The puzzling thing is that my model's predictions are simply a shift of the previous points. But how did my model know the actual previous points at prediction time? I never gave it the monthly values in the testing set! If the claim is that it simply copies the previous points it receives as input, then note that the 12 months I give it as input have completely different values than the ones it predicts (so it is not copying its input), yet the forecasted values are shifts of the actual ones, which it has never seen.
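To double-check that a forecast really is a lagged copy, one can correlate it with the actuals at different lags; if the correlation peaks at lag 1 rather than lag 0, the forecast is essentially a one-step-delayed copy. A minimal sketch with stand-in data (the arrays here are illustrative, not my real series):

import numpy

actual = numpy.sin(numpy.linspace(0, 6, 24))           # any smooth series
pred = numpy.concatenate(([actual[0]], actual[:-1]))   # its one-step-delayed copy
for lag in range(4):
    n = len(actual) - lag
    r = numpy.corrcoef(pred[lag:lag + n], actual[:n])[0, 1]
    print('lag', lag, '-> correlation %.3f' % r)       # peaks at lag 1 for a shifted copy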

Below is an example:

[Plots of predicted vs. actual values: the predicted curve closely tracks the actual curve, shifted forward in time]

My code is adapted from here:

Below is my code:

# imports used throughout (the Keras imports assume the tensorflow.keras API)
import math
import numpy
from numpy import array, split
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, RepeatVector, TimeDistributed, Dense

# train/test splitting
split_position = int(len(scaled_data) * 0.8)  # 8 years for training
train = scaled_data[0:split_position]
test = scaled_data[split_position:]
print('length of train =', len(train))
print('length of test =', len(test))
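The variables scaled_data, scaler, and smoothed come from my preprocessing, which is not shown here. Roughly, the series is first-order differenced and then scaled to [0, 1]; something like the sketch below, where load_monthly_series is just a placeholder for the actual loading code:

from sklearn.preprocessing import MinMaxScaler

smoothed = load_monthly_series()                  # placeholder: ~120 monthly values
differenced = numpy.diff(numpy.array(smoothed))   # d[t] = value[t+1] - value[t]
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(differenced.reshape(-1, 1))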




# split train and test data into yearly windows (3d): [year, month, feature]
def split_data_yearly(train, test):
    # restructure into windows of yearly data; integer division so that
    # numpy.split receives an int number of sections (a float raises in Python 3)
    train = array(split(train, len(train) // 12))
    test = array(split(test, len(test) // 12))
    return train, test
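As a quick shape check with dummy values, 96 training months and 24 test months come out as 8 and 2 yearly windows:

dummy = numpy.arange(120).reshape(-1, 1)
tr, te = split_data_yearly(dummy[:96], dummy[96:])
print(tr.shape, te.shape)   # (8, 12, 1) (2, 12, 1)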
 
# evaluate one or more yearly forecasts against expected values
def evaluate_forecasts(actual, predicted):
    scores = list()
    # calculate an RMSE score for each month
    for i in range(actual.shape[1]):
        mse = mean_squared_error(actual[:, i], predicted[:, i])
        rmse = math.sqrt(mse)
        scores.append(rmse)
    # calculate overall RMSE across all months and years
    s = 0
    for row in range(actual.shape[0]):
        for col in range(actual.shape[1]):
            s += (actual[row, col] - predicted[row, col])**2
    score = math.sqrt(s / (actual.shape[0] * actual.shape[1]))


    ################ plot prediction vs actual ################
    predicted = predicted.reshape(predicted.shape[0], predicted.shape[1])
    jump = 12  # each test sample covers 12 months
    inv_scores = list()
    for i in range(len(predicted)):
        sample_predicted = predicted[i, :]
        sample_actual = actual[i, :]

        # inverse normalization
        sample_predicted_inv = scaler.inverse_transform(sample_predicted.reshape(-1, 1))
        sample_actual_inv = scaler.inverse_transform(sample_actual.reshape(-1, 1))

        # inverse differencing: add back the aligned values of the original series
        s = numpy.array(smoothed).reshape(-1, 1)
        offset = split_position + (i * jump)
        sample_actual_inv = sample_actual_inv + s[offset:offset + len(sample_actual_inv)]
        sample_predicted_inv = sample_predicted_inv + s[offset:offset + len(sample_actual_inv)]

        months = ['August-' + str(19 + i), 'September-' + str(19 + i), 'October-' + str(19 + i),
                  'November-' + str(19 + i), 'December-' + str(19 + i), 'January-' + str(20 + i),
                  'February-' + str(20 + i), 'March-' + str(20 + i), 'April-' + str(20 + i),
                  'May-' + str(20 + i), 'June-' + str(20 + i), 'July-' + str(20 + i)]
        pyplot.plot(months, sample_actual_inv, 'b-', label='Actual')
        pyplot.plot(months, sample_predicted_inv, '--', color='orange', label='Predicted')
        pyplot.legend()
        pyplot.xticks(rotation=25)
        pyplot.title('Encoder Decoder LSTM Prediction', y=1.08)
        pyplot.show()

        ################ determine RMSE after inversion ################
        mse = mean_squared_error(sample_actual_inv, sample_predicted_inv)
        rmse = math.sqrt(mse)
        inv_scores.append(rmse)

    return score, scores, inv_scores
 
# summarize scores
def summarize_scores(name, score, scores):
    s_scores = ', '.join(['%.1f' % s for s in scores])
    print('%s: [%.3f] %s' % (name, score, s_scores))
 
# convert history into inputs and outputs
def to_supervised(train, n_input, n_out=12):
    # flatten data
    data = train.reshape((train.shape[0]*train.shape[1], train.shape[2]))
    X, y = list(), list()
    in_start = 0
    # step over the entire history one time step at a time
    for _ in range(len(data)):
        # define the end of the input sequence
        in_end = in_start + n_input
        out_end = in_end + n_out
        # ensure we have enough data for this instance
        if out_end <= len(data):
            X.append(data[in_start:in_end, :])
            y.append(data[in_end:out_end, 0])
        # move along one time step
        in_start += 1
    return array(X), array(y)
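As a shape check: once train has been reshaped by split_data_yearly, the 8 training years flatten to 96 monthly steps, and the stride-1 sliding window yields 96 - (12 + 12) + 1 = 73 input/output pairs:

X, y = to_supervised(train, n_input=12)   # train has shape (8, 12, 1) here
print(X.shape, y.shape)                   # (73, 12, 1) (73, 12)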
 
# train the model
def build_model(train, n_input):
    # prepare data
    train_x, train_y = to_supervised(train, n_input)

    #take portion for validation
    val_size=12;
    test_x,test_y=train_x[-val_size:], train_y[-val_size:]
    train_x,train_y=train_x[0:-val_size],train_y[0:-val_size]
    

    # define parameters
    verbose, epochs, batch_size = 1,25, 8
    n_timesteps, n_features, n_outputs = train_x.shape[1], train_x.shape[2], train_y.shape[1]
    # reshape output into [samples, timesteps, features]
    train_y = train_y.reshape((train_y.shape[0], train_y.shape[1], 1))
    # define model
    model = Sequential()
    model.add(LSTM(64, activation='relu', input_shape=(n_timesteps, n_features)))
    model.add(RepeatVector(n_outputs))
    model.add(LSTM(64, activation='relu', return_sequences=True))
    model.add(TimeDistributed(Dense(100, activation='relu')))
    model.add(TimeDistributed(Dense(1)))
    #sgd = optimizers.SGD(lr=0.004, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='mse', optimizer='adam')
    # fit network
    train_history= model.fit(train_x, train_y, epochs=epochs, batch_size=batch_size, validation_data=(test_x, test_y),verbose=verbose)
    
    loss = train_history.history['loss']
    val_loss = train_history.history['val_loss']
    pyplot.plot(loss)
    pyplot.plot(val_loss)
    pyplot.legend(['loss', 'val_loss'])
    pyplot.show()

    return model
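A quick architecture check (a sketch on random data, not my real series; note that this briefly trains the model and shows the loss plot):

dummy_years = numpy.random.rand(8, 12, 1)   # 8 dummy years of 12 months, 1 feature
m = build_model(dummy_years, 12)
print(m.output_shape)                       # (None, 12, 1): 12 months out, 1 value each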
 
# make a forecast
def forecast(model, history, n_input):
    # flatten data
    data = array(history)
    data = data.reshape((data.shape[0]*data.shape[1], data.shape[2]))
    # retrieve last observations for input data
    input_x = data[-n_input:, :]
    # reshape into [1, n_input, n]
    input_x = input_x.reshape((1, input_x.shape[0], input_x.shape[1]))
    # forecast the next year
    yhat = model.predict(input_x, verbose=0)
    # we only want the vector forecast
    yhat = yhat[0]
    return yhat
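For example, right after training, with history holding the yearly training windows, this returns a single 12-month-ahead forecast in the scaled, differenced space:

yhat = forecast(model, [x for x in train], n_input=12)
print(yhat.shape)   # (12, 1): one predicted value per month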
 
# evaluate a single model
def evaluate_model(train, test, n_input):
    # fit model
    model = build_model(train, n_input)
    # history is a list of yearly data
    history = [x for x in train]
    # walk-forward validation over each year
    predictions = list()
    for i in range(len(test)):
        # predict the year
        yhat_sequence = forecast(model, history, n_input)
        # store the predictions
        predictions.append(yhat_sequence)
        # get real observation and add to history for predicting the next year
        history.append(test[i,:])

    # evaluate predictions for each year
    predictions = array(predictions)

    score, scores, inv_scores = evaluate_forecasts(test[:, :, 0], predictions)
    return score, scores,inv_scores
 
# split into train and test
train, test = split_data_yearly(train, test)


# evaluate model and get scores
n_input = 12
score, scores, inv_scores = evaluate_model(train, test, n_input)
# summarize scores
summarize_scores('lstm', score, scores)
print('RMSE score after inversion:',inv_scores)
# plot scores
months=['July','August','September','October','November','December','January','February','March','April','May','June']
#pyplot.plot(months, scores, marker='o', label='lstm')
#pyplot.show()

Solution

  • Differencing is the key here!

    After further investigation, I found out that my model predicts values that are almost zero in the differenced space (i.e., it is not really learning). When I invert the differencing, I am therefore adding (almost) zero to the actual value at the previous timestep, which produces the shifted pattern above; see the sketch below.

    Therefore, I need to tune my LSTM model so that it actually learns, or perhaps remove the long stretches of zeros from the data itself, since I have many of them.
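    A tiny numeric sketch of that failure mode (illustrative values): if the model predicts (near) zero for every differenced step, inverting the differencing simply replays the previous actual values, i.e. a one-step shift:

    import numpy

    actual = numpy.array([10., 14., 13., 18., 20.])   # original (undifferenced) series
    diff_pred = numpy.zeros(len(actual) - 1)          # model output: ~0 in differenced space
    # inverse differencing: forecast[t] = previous actual value + predicted difference
    reconstructed = actual[:-1] + diff_pred
    print(reconstructed)   # [10. 14. 13. 18.] -- the actual values, shifted by one step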