Why does Gekko_NN_TF add so many degrees of freedom?

I am using Gekko for trajectory optimisation. The dynamics of my system are being modelled by a NN and wrapped for Gekko. In the minimal example below I am approximating a sinusoid with the NN. Using the NN I have 1700 degrees of freedom whilst replacing it with m.sin(ctrl) leaves me with 0 DOF.

I would have assumed that the NN is immutable throughout the optimisation, why are so many degrees of freedom added?

Here is a minimal example:

import tensorflow as tf
import numpy as np
import pandas as pd

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from gekko import GEKKO
from gekko.ML import Gekko_NN_TF, CustomMinMaxGekkoScaler



# generate some random sinuisodal data
X = np.linspace(0, 2*np.pi, 1000)
y = np.sin(X) + np.random.normal(0, 0.1, 1000)


# model the data with a neural network
model = Sequential([
    Dense(16, activation='relu', input_shape=(1,)),
    Dense(1)
])

model.compile(optimizer='adam', loss='mse', metrics=['mse', 'mae'])


# normalise the data
X = pd.DataFrame(np.hstack([X.reshape(-1, 1), y.reshape(-1, 1)]), columns=['x', 'y'])

scaler = CustomMinMaxGekkoScaler(X, ['x'], ['y'])


history = model.fit(scaler.newdataScaled['x'].values, scaler.newdataScaled['y'].values, epochs=100, batch_size=32, verbose=0)

m = GEKKO(remote=True)

# options
m.options.NODES = 2
m.options.SOLVER = 3
m.options.IMODE = 6
m.options.MAX_ITER = 1000

m.time = np.linspace(0, 1, 101)

tf = m.FV(value=1, lb=0.1, ub=10)
tf.STATUS = 1

# wrap the model in a Gekko_NN_TF object
wrapped_model = Gekko_NN_TF(model, scaler.minMaxValues(), m, n_output=1, activationFxn='relu')

# set up state variables
x = m.CV(value=0)
u = m.CV(value=0)

# set up control variable
ctrl = m.MV(value=0, lb=0, ub=2*np.pi)
ctrl.STATUS = 1

# add equations of motion
m.Equation(x.dt() == tf * u)
m.Equation(u.dt() == tf * wrapped_model.predict([ctrl]))

# add endpoint constraint
waypoint = m.CV(value=0)
# if we've reached the waypoint, change it to 1
m.Equation(waypoint == m.if3(x - 1, 0, 1))

# fix the final value
m.fix_final(waypoint, val = 1)

# minimise final time
m.Obj(tf)

m.solve(GUI=True)

Thank you for your help.

Solution

The neural network adds degrees of freedom (1700) because of the time horizon (100 time steps) x 16 ReLU (Rectified Linear Unit) functions that each require an additional variable for the switching condition for the abs statement (not sure why the extra 100 variables though). Without the switching condition, the function would not be continuously differentiable for the gradient-based optimizer. If degrees of freedom are a problem, try using function that doesn't require additional degrees of freedom such as tanh activation functions. Most of the extra parts of the model are removed to highlight zero degrees of freedom when using tanh.

import tensorflow as tf
import numpy as np
import pandas as pd

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from gekko import GEKKO
from gekko.ML import Gekko_NN_TF, CustomMinMaxGekkoScaler

# generate some random sinuisodal data
X = np.linspace(0, 2*np.pi, 1000)
y = np.sin(X) + np.random.normal(0, 0.1, 1000)

# model the data with a neural network
model = Sequential([
    Dense(16, activation='tanh', input_shape=(1,)),
    Dense(1)
])
model.compile(optimizer='adam', loss='mse', metrics=['mse', 'mae'])

# normalise the data
X = pd.DataFrame(np.hstack([X.reshape(-1, 1), y.reshape(-1, 1)]), columns=['x', 'y'])

scaler = CustomMinMaxGekkoScaler(X, ['x'], ['y'])

history = model.fit(scaler.newdataScaled['x'].values, scaler.newdataScaled['y'].values, epochs=100, batch_size=32, verbose=0)

m = GEKKO(remote=True)

# options
m.options.NODES = 2
m.options.SOLVER = 3
m.options.IMODE = 6
m.options.MAX_ITER = 1000

m.time = np.linspace(0, 1, 101)

tf = m.FV(value=1, lb=0.1, ub=10)
tf.STATUS = 0

# wrap the model in a Gekko_NN_TF object
wrapped_model = Gekko_NN_TF(model, scaler.minMaxValues(), m, n_output=1, activationFxn='tanh')

# set up state variables
#x = m.Var(value=0)
u = m.Var(value=0)

# set up control variable
ctrl = m.MV(value=0, lb=0, ub=2*np.pi)
ctrl.STATUS = 0

# add equations of motion
#m.Equation(x.dt() == tf * u)
m.Equation(u.dt() == tf * wrapped_model.predict([ctrl]))

# add endpoint constraint
#waypoint = m.CV(value=0)
# if we've reached the waypoint, change it to 1
#m.Equation(waypoint == m.if3(x - 1, 0, 1))

# fix the final value
#m.fix_final(waypoint, val = 1)

# minimise final time
#m.Obj(tf)

m.solve()

To reduce the degrees of freedom for the full model, use m.Var() instead of m.CV(). Another option is to use m.CV() with m.options.CV_TYPE=2 for a squared error instead of an l1-norm error that also adds additional degrees of freedom.

The number of variables and equations in the implementation of the neural network can also be reduced by switching to Intermediate() declarations, similar to this example:

from gekko import GEKKO
import numpy as np
import matplotlib.pyplot as plt  

# generate training data
x = np.linspace(0.0,2*np.pi,20)
y = np.sin(x)

# option for fitting function
select = True # True / False
if select:
    # Size with cosine function
    nin = 1  # inputs
    n1 = 1   # hidden layer 1 (linear)
    n2 = 1   # hidden layer 2 (nonlinear)
    n3 = 1   # hidden layer 3 (linear)
    nout = 1 # outputs
else:
    # Size with hyperbolic tangent function
    nin = 1  # inputs
    n1 = 2   # hidden layer 1 (linear)
    n2 = 2   # hidden layer 2 (nonlinear)
    n3 = 2   # hidden layer 3 (linear)
    nout = 1 # outputs

# Initialize gekko
train = GEKKO() 
test = GEKKO()

model = [train,test]

for m in model:
    # input(s)
    m.inpt = m.Param()

    # layer 1
    m.w1 = m.Array(m.FV, (nin,n1))
    m.l1 = [m.Intermediate(m.w1[0,i]*m.inpt) for i in range(n1)]

    # layer 2
    m.w2a = m.Array(m.FV, (n1,n2))
    m.w2b = m.Array(m.FV, (n1,n2))
    if select:
        m.l2 = [m.Intermediate(sum([m.cos(m.w2a[j,i]+m.w2b[j,i]*m.l1[j]) \
                                for j in range(n1)])) for i in range(n2)]
    else:
        m.l2 = [m.Intermediate(sum([m.tanh(m.w2a[j,i]+m.w2b[j,i]*m.l1[j]) \
                                for j in range(n1)])) for i in range(n2)]

    # layer 3
    m.w3 = m.Array(m.FV, (n2,n3))
    m.l3 = [m.Intermediate(sum([m.w3[j,i]*m.l2[j] \
            for j in range(n2)])) for i in range(n3)]

    # output(s)
    m.outpt = m.CV()
    m.Equation(m.outpt==sum([m.l3[i] for i in range(n3)]))

    # flatten matrices
    m.w1 = m.w1.flatten()
    m.w2a = m.w2a.flatten()
    m.w2b = m.w2b.flatten()
    m.w3 = m.w3.flatten()

# Fit parameter weights
m = train
m.inpt.value=x
m.outpt.value=y
m.outpt.FSTATUS = 1
for i in range(len(m.w1)):
    m.w1[i].FSTATUS=1
    m.w1[i].STATUS=1
    m.w1[i].MEAS=1.0
for i in range(len(m.w2a)):
    m.w2a[i].STATUS=1
    m.w2b[i].STATUS=1
    m.w2a[i].FSTATUS=1
    m.w2b[i].FSTATUS=1
    m.w2a[i].MEAS=1.0
    m.w2b[i].MEAS=0.5
for i in range(len(m.w3)):
    m.w3[i].FSTATUS=1
    m.w3[i].STATUS=1
    m.w3[i].MEAS=1.0
m.options.IMODE = 2
m.options.SOLVER = 3
m.options.EV_TYPE = 2
m.solve(disp=False)

# Test sample points
m = test
for i in range(len(m.w1)):
    m.w1[i].MEAS=train.w1[i].NEWVAL
    m.w1[i].FSTATUS = 1
    print('w1['+str(i)+']: '+str(m.w1[i].MEAS))
for i in range(len(m.w2a)):
    m.w2a[i].MEAS=train.w2a[i].NEWVAL
    m.w2b[i].MEAS=train.w2b[i].NEWVAL
    m.w2a[i].FSTATUS = 1
    m.w2b[i].FSTATUS = 1
    print('w2a['+str(i)+']: '+str(m.w2a[i].MEAS))
    print('w2b['+str(i)+']: '+str(m.w2b[i].MEAS))
for i in range(len(m.w3)):
    m.w3[i].MEAS=train.w3[i].NEWVAL
    m.w3[i].FSTATUS = 1
    print('w3['+str(i)+']: '+str(m.w3[i].MEAS))
m.inpt.value=np.linspace(-2*np.pi,4*np.pi,100)
m.options.IMODE = 2
m.options.SOLVER = 3
m.solve(disp=False)

plt.figure()
plt.plot(x,y,'bo',label='data')
plt.plot(test.inpt.value,test.outpt.value,'r-',label='predict')
plt.legend(loc='best')
plt.ylabel('y')
plt.xlabel('x')
plt.show()