Search code examples
python artificial-intelligence forecasting pybrain

Making a correct ANN for forecasting


This is my first time using Python, so I have a lot of doubts.

I'm trying to make a simple ANN for forecasting in PyBrain. It is a 2-input, 1-output net. The first input column holds the year and the second holds the month of the year. The output is the normal rainfall for each month.

I don't know how many things I am doing wrong, but when I plot the results, I get errors.

This is my code:

from pybrain.datasets import SupervisedDataSet
from pybrain.tools.shortcuts import buildNetwork
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.tools.validation import ModuleValidator
from pybrain.structure import SigmoidLayer, LinearLayer,TanhLayer
from pybrain.utilities import percentError
import matplotlib.pyplot as plt
import numpy as np
import math

#----------------------------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    # Script: load (year, month) -> rainfall samples from CSV, train a small
    # PyBrain network on them, plot the training/validation error curves and
    # dump the learned parameters.

    ds = SupervisedDataSet(2, 1)

    # Named entradas/salidas rather than input/output so the builtin
    # `input` is not shadowed.
    entradas = np.loadtxt('entradas.csv', delimiter=',')
    salidas = np.loadtxt('salidas.csv', delimiter=',')

    # Fail loudly on mismatched files instead of raising an IndexError midway
    # through the loop (or silently ignoring surplus targets).
    if len(entradas) != len(salidas):
        raise ValueError(
            "entradas.csv y salidas.csv deben tener el mismo número de filas")

    for muestra, objetivo in zip(entradas, salidas):
        ds.addSample(muestra, objetivo)

    print(ds['input'])
    print("Hay una serie de", len(ds['target']), "datos")

    # NOTE(review): the samples are fed to the network unscaled. With
    # year-sized inputs the sigmoid hidden units presumably saturate, which
    # would explain a flat error curve -- consider min-max scaling both the
    # inputs and the targets before training; confirm against the data.

    # Network topology: ds.indim inputs, 5 sigmoid hidden units, ds.outdim outputs.
    n = buildNetwork(ds.indim, 5, ds.outdim, recurrent=True,
                     hiddenclass=SigmoidLayer)

    # Network training.
    # Split: 60 % training; the remaining 40 % is halved into test and validation.
    trndata, partdata = ds.splitWithProportion(0.60)
    tstdata, validata = partdata.splitWithProportion(0.50)

    print("Datos para Validacion:", len(validata))
    print("Datos para Test:", len(tstdata))
    print("Datos para Entrenamiento:", len(trndata))

    treinadorSupervisionado = BackpropTrainer(
        n, dataset=trndata, momentum=0.1, verbose=True, weightdecay=0.01)

    numeroDeEpocasPorPunto = 100
    trnerr, valerr = treinadorSupervisionado.trainUntilConvergence(
        dataset=trndata, maxEpochs=numeroDeEpocasPorPunto)

    # Column 0 of the inputs is the year.
    max_anno = entradas.max(axis=0)[0]
    min_anno = entradas.min(axis=0)[0]
    max_precip = salidas.max()
    min_precip = salidas.min()

    print("El primer año de la serie temporal disponible es:", min_anno)
    print("El ultimo año de la serie temporal disponible es:", max_anno)
    print("La máxima precipitación registrada en la serie temporal es:", max_precip)
    print("La mínima precipitación registrada en la serie temporal es:", min_precip)

    # Error curves: training in blue, validation in red.
    fig1 = plt.figure()
    ax1 = fig1.add_subplot(111)
    ax1.set_xlabel('número de épocas')
    ax1.set_ylabel(u'Error')
    ax1.plot(trnerr, 'b', valerr, 'r')
    plt.show()

    treinadorSupervisionado.trainOnDataset(trndata, 50)
    print(treinadorSupervisionado.totalepochs)

    # This is a single-output regression network, so argmax over the output
    # column is always 0 and percentError (a classification metric) is
    # meaningless here. Report the mean squared error on the held-out sets.
    print("Error cuadrático medio (test):", ModuleValidator.MSE(n, tstdata))
    print("Error cuadrático medio (validación):", ModuleValidator.MSE(n, validata))

    print('Pesos finales:', n.params)

    # Network parameters, module by module, with their outgoing connections.
    for mod in n.modules:
        print("Module:", mod.name)
        if mod.paramdim > 0:
            print("--parameters:", mod.params)
        for conn in n.connections[mod]:
            print("-connection to", conn.outmod.name)
            if conn.paramdim > 0:
                print("- parameters", conn.params)

    # Recurrent connections belong to the network, not to a single module, so
    # they are listed once after the module loop instead of once per module.
    if hasattr(n, "recurrentConns"):
        print("Recurrent connections")
        for conn in n.recurrentConns:
            print("-", conn.inmod.name, " to", conn.outmod.name)
            if conn.paramdim > 0:
                print("- parameters", conn.params)

And this is the plot I get after running the code:

Error vs Epochs

Where the blue line is the training error and the red line is the validation error.

This doesn't make any sense. I have searched other questions, but I still don't know why I'm having this result.

My goal is to predict the rainfall for each month in subsequent years, for example 2010 (the series runs from 1851 to 2008).


Solution

  • After checking your dataset, I noticed that it is time-series data. Using the time itself (month and year) as features usually doesn't work well in this case.

    The most common architectures for predicting time series are the RNN and its upgraded version, the LSTM. There is a nice tutorial on LSTMs using Keras at http://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/

    I tried training an LSTM (based on the tutorial) on your dataset and got a better-looking validation loss trend:

    loss chart

    I trained an LSTM (100 epochs) to predict the rainfall from the previous 12 months of data:

    import numpy
    import matplotlib.pyplot as plt
    import pandas
    import math
    from keras.models import Sequential
    from keras.layers import Dense, LSTM, Dropout
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.metrics import mean_squared_error
    
    
    # convert an array of values into a dataset matrix
    def create_dataset(dataset, look_back=1):
        """Turn a series into supervised (window, next value) pairs.

        Only column 0 of ``dataset`` is read. Sample ``i`` pairs the
        ``look_back`` consecutive values starting at row ``i`` with the value
        that immediately follows them.

        Args:
            dataset: 2-D array indexed as ``dataset[row, 0]``.
            look_back: number of past values in each input window.

        Returns:
            A tuple ``(dataX, dataY)`` of numpy arrays: ``dataX`` has one
            window of ``look_back`` values per row, ``dataY`` the value
            following each window. Both are empty when the series has no
            more than ``look_back`` rows.
        """
        dataX, dataY = [], []
        # The last usable window starts at len(dataset) - look_back - 1, so
        # the range must end at len(dataset) - look_back. Subtracting a
        # further 1 silently discarded the final sample.
        for i in range(len(dataset) - look_back):
            dataX.append(dataset[i:i + look_back, 0])
            dataY.append(dataset[i + look_back, 0])
        return numpy.array(dataX), numpy.array(dataY)
    
    # Load the rainfall series (first column of salidas.csv).
    # The file has no header row (the question reads it with a plain
    # np.loadtxt), so header=None keeps the first rainfall value as data
    # instead of consuming it as the column name.
    dataframe = pandas.read_csv('salidas.csv', usecols=[0], header=None,
                                engine='python')
    dataset = dataframe.values.astype('float32')

    # Normalize the series to [0, 1].
    # NOTE: the scaler is fitted on the whole series, i.e. before the
    # train/test split below.
    scaler = MinMaxScaler(feature_range=(0, 1))
    dataset = scaler.fit_transform(dataset)

    # Chronological split: first 67 % for training, the rest for testing.
    train_size = int(len(dataset) * 0.67)
    train, test = dataset[:train_size, :], dataset[train_size:, :]

    # Build samples: X = the previous `look_back` values, Y = the next value.
    look_back = 12
    trainX, trainY = create_dataset(train, look_back)
    testX, testY = create_dataset(test, look_back)

    # Reshape the inputs to [samples, time steps, features]: each sample is a
    # single time step carrying `look_back` features.
    trainX = numpy.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
    testX = numpy.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

    # Create and fit the LSTM network.
    model = Sequential()
    # input_shape=(time steps, features) is the explicit form of the legacy
    # `input_dim` keyword.
    model.add(LSTM(4, input_shape=(1, look_back)))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    # `epochs` replaces the legacy `nb_epoch` keyword.
    history = model.fit(trainX, trainY, validation_split=0.33, epochs=100,
                        batch_size=1)

    # Plot the training and validation loss per epoch.
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()