Search code examples
python pandas dataframe tensorflow keras

How can I predict values beyond the end of a pandas DataFrame?


I built an LSTM-based model and trained it to predict intraday stock price changes, where the time step is one second. On the test data the predictions are comparable to the real values, but I need a forecast for the future, beyond the end of the existing data. What am I doing wrong?


        # Scale the 'close' column to [0, 1], build model inputs from it, run the
        # model, and plot the predictions against the observed prices.
        close = df['close']
        values = close.values
        # Column vector of shape (n_rows, 1).
        values = values.reshape(-1, 1)

        training_scaler = MinMaxScaler(feature_range=(0, 1))

        testing_input = values
        # The scaler is fitted on the very data that is then fed to the model.
        testing_input = training_scaler.fit_transform(testing_input)
        testing = []
        # NOTE(review): the trailing `[0]` keeps only the first row of each 50-row
        # slice, i.e. testing_input[i - 50]. As i - 50 runs over 0 .. n_rows - 1,
        # `testing` ends up holding exactly the rows of `testing_input`: one
        # single-value sample per existing row, not a 50-step window, and nothing
        # that lies past the end of the data.
        for i in range(50, len(testing_input) + 50):
            testing.append(testing_input[i - 50:i][0])

        testing = np.array(testing)
        # (n_rows, 1) -> (n_rows, 1, 1): adds a trailing axis of length 1.
        testing = np.reshape(testing, (testing.shape[0], testing.shape[1], 1))
        predict = model.predict(testing)
        # Map the scaled predictions back to the original price scale.
        # NOTE(review): presumably the model returns shape (n_rows, 1) - confirm.
        predict = training_scaler.inverse_transform(predict)

        # Both series are plotted against their positional index, so they are
        # drawn over the same x-positions.
        plt.plot(values, color='blue', label='Stock Price')
        plt.plot(predict, color='red', label='Predicted Stock Price')
        plt.title('Changes')
        plt.xlabel('Timeline')
        plt.ylabel('Stock Price')
        plt.legend()
        plt.show()

My results

It turns out that the model predicts data that I already know. How can I predict future data?


Solution

  • I found a solution. The problem was that I had trained the model incorrectly and it was unable to predict data outside the set. The code below works correctly:

    def learn(self, dataset_path: str) -> Sequential:
        """Train an LSTM that maps a window of past closes to a multi-step forecast.

        The 'close' column of the CSV is forward-filled, min-max scaled to
        [0, 1] and cut into sliding (lookback, forecast) pairs. The lookback
        window is 24% of the series length and the forecast horizon is 12%.

        Args:
            dataset_path: Path to a CSV file that contains a 'close' column.

        Returns:
            The fitted model. Its input is a window of shape (n_lookback, 1)
            and its output layer has n_forecast units.

        Raises:
            ValueError: If the dataset has too few rows for the 12% forecast
                horizon to contain at least one step.
        """
        df = pd.read_csv(dataset_path)
        # Series.ffill() replaces fillna(method='ffill'), which pandas
        # deprecated in 2.1.
        closes = df['close'].ffill().values.reshape(-1, 1)

        n_rows = len(closes)
        n_lookback = int(n_rows * 0.24)
        n_forecast = int(n_rows * 0.12)
        if n_forecast < 1:
            # Fail here with a clear message instead of deep inside Keras when
            # it is asked to build an output layer with zero units.
            raise ValueError(
                f"dataset has only {n_rows} rows; too few to derive a "
                "non-empty forecast horizon"
            )

        # The scaler is local to this call; only the model is returned.
        scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(closes)

        # Every split point i yields the n_lookback rows before it as input
        # and the n_forecast rows starting at it as target.
        split_points = range(n_lookback, n_rows - n_forecast + 1)
        # NOTE(review): float16 presumably keeps the window arrays small, at
        # the cost of precision - confirm this is intended.
        X = np.array(
            [scaled[i - n_lookback:i] for i in split_points], dtype=np.float16
        )
        Y = np.array(
            [scaled[i:i + n_forecast] for i in split_points], dtype=np.float16
        )

        model = Sequential()
        model.add(LSTM(units=50, return_sequences=True, input_shape=(n_lookback, 1)))
        model.add(LSTM(units=50))
        model.add(Dense(n_forecast))

        model.compile(loss='mean_squared_error', optimizer='adam')
        model.fit(X, Y, epochs=30, batch_size=128)

        return model
    

    Then call the predict method:

    def predict(self, model: Sequential, df: pd.DataFrame) -> pd.DataFrame:
        """Forecast the closes that follow the last row of ``df``.

        The last 24% of the forward-filled, min-max scaled 'close' column is
        fed to ``model`` as a single window. The model output is mapped back
        to the original price scale and paired with consecutive timestamps
        that start one step after the last observed one.

        Args:
            model: A fitted model exposing ``predict``.
                NOTE(review): its input window length must equal
                int(len(df) * 0.24) - confirm that ``df`` has the same
                length as the training data.
            df: Frame with a 'close' column and a 'unix' column.

        Returns:
            A frame with columns 'unix' and 'Forecast', one row per forecast
            value that is not NaN.

        Raises:
            ValueError: If ``df`` has too few rows to form a lookback window.
        """
        # Series.ffill() replaces fillna(method='ffill'), which pandas
        # deprecated in 2.1.
        closes = df['close'].ffill().values.reshape(-1, 1)

        n_lookback = int(len(closes) * 0.24)
        if n_lookback < 1:
            # With n_lookback == 0 the slice [-0:] would select the whole
            # series instead of an empty window.
            raise ValueError(
                f"df has only {len(closes)} rows; too few to form a "
                "lookback window"
            )

        # NOTE(review): the scaler is refit on ``df`` rather than reused from
        # training, so forecasts are on the right scale only if ``df`` spans
        # the same value range as the training data - confirm.
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaled = scaler.fit_transform(closes)

        window = scaled[-n_lookback:].reshape(1, n_lookback, 1)
        raw_forecast = model.predict(window).reshape(-1, 1)
        forecast = scaler.inverse_transform(raw_forecast).flatten()

        # 'unix' values are divided by this step to obtain the timestamp unit
        # used in the result (presumably microseconds to seconds - confirm).
        timestamp_step = 1_000_000
        last_unix = int(df['unix'].iloc[-1] / timestamp_step)

        # The first forecast value belongs to the step right after the last
        # observation, and every forecast value gets its own timestamp, so the
        # final one is no longer dropped.
        first_future = last_unix + 1
        df_future = pd.DataFrame({
            'unix': np.arange(first_future, first_future + len(forecast)),
            'Forecast': forecast,
        })

        return df_future[df_future['Forecast'].notna()]