I have been unable to get reasonable performance using the mxnet LinearRegressionOutput layer.
The self-contained example below attempts to perform regression of a simple polynomial function (y = x1 + x2^2 + x3^3) with a small amount of random noise thrown in.
The mxnet regression example given here is used, along with a slightly more complex network that includes a hidden layer. The example below also trains regression networks using the neuralnet and nnet packages, which, as can be seen from the plots, perform much better.
I realize the usual answer to a poorly performing network is hyper-parameter tuning; however, I have tried a range of values without any improvement in performance (a sketch of the kind of sweep I ran is shown below).
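For reference, here is a minimal sketch of the kind of learning-rate sweep I tried. It reuses train.x, train.y, and the lro1 symbol defined in the reproducible example below, and the grid values are illustrative only:
# Hypothetical learning-rate sweep; reuses objects from the example below
library(mxnet)
lrGrid <- c(2e-7, 2e-6, 2e-5, 2e-4)
sweepRmse <- sapply(lrGrid, function(lr) {
  mx.set.seed(0)
  m <- mx.model.FeedForward.create(lro1, X=train.x, y=train.y,
                                   ctx=mx.cpu(), num.round=1000,
                                   array.batch.size=20, learning.rate=lr,
                                   momentum=0.9, eval.metric=mx.metric.rmse,
                                   verbose=FALSE, array.layout="rowmajor")
  # training-set RMSE for this learning rate
  sqrt(mean((train.y - predict(m, train.x, array.layout="rowmajor"))^2))
})
names(sweepRmse) <- lrGrid
print(sweepRmse)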
My set-up is as follows:
MXNet version: 0.7
R `sessionInfo()`: R version 3.3.2 (2016-10-31)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1
Poor regression results of mxnet, produced by this reproducible example:
## SIMPLE REGRESSION PROBLEM
# Check mxnet out-of-the-box performance VS neuralnet, and caret/nnet
library(mxnet)
library(neuralnet)
library(nnet)
library(caret)
library(tictoc)
library(reshape)
# Data definitions
nObservations <- 1000
noiseLvl <- 0.1
# Network config
nHidden <- 3
learnRate <- 2e-6
momentum <- 0.9
batchSize <- 20
nRound <- 1000
verbose <- FALSE
array.layout = "rowmajor"
# GENERATE DATA:
df <- data.frame(x1=runif(nObservations),
                 x2=runif(nObservations),
                 x3=runif(nObservations))
df$y <- df$x1 + df$x2^2 + df$x3^3 + noiseLvl*runif(nObservations)
# normalize data columns
# df <- scale(df)
# Separate data into train/test
test.ind = seq(1, nObservations, 10) # 1 in 10 samples for testing
train.x = data.matrix(df[-test.ind, -which(colnames(df) %in% c("y"))])
train.y = df[-test.ind, "y"]
test.x = data.matrix(df[test.ind, -which(colnames(df) %in% c("y"))])
test.y = df[test.ind, "y"]
# Define mxnet network, following 5-minute regression example from here:
# http://mxnet-tqchen.readthedocs.io/en/latest//packages/r/fiveMinutesNeuralNetwork.html#regression
data <- mx.symbol.Variable("data")
label <- mx.symbol.Variable("label")
fc1 <- mx.symbol.FullyConnected(data, num_hidden=1, name="fc1")
lro1 <- mx.symbol.LinearRegressionOutput(data=fc1, label=label, name="lro")
# Train MXNET model
mx.set.seed(0)
tic("mxnet training 1")
mxModel1 <- mx.model.FeedForward.create(lro1, X=train.x, y=train.y,
                                        eval.data=list(data=test.x, label=test.y),
                                        ctx=mx.cpu(), num.round=nRound,
                                        array.batch.size=batchSize,
                                        learning.rate=learnRate, momentum=momentum,
                                        eval.metric=mx.metric.rmse,
                                        verbose=FALSE, array.layout=array.layout)
toc()
# Train network with a hidden layer
fc1 <- mx.symbol.FullyConnected(data, num_hidden=nHidden, name="fc1")
tanh1 <- mx.symbol.Activation(fc1, act_type="tanh", name="tanh1")
fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden=1, name="fc2")
lro2 <- mx.symbol.LinearRegressionOutput(data=fc2, label=label, name="lro")
tic("mxnet training 2")
mxModel2 <- mx.model.FeedForward.create(lro2, X=train.x, y=train.y,
                                        eval.data=list(data=test.x, label=test.y),
                                        ctx=mx.cpu(), num.round=nRound,
                                        array.batch.size=batchSize,
                                        learning.rate=learnRate, momentum=momentum,
                                        eval.metric=mx.metric.rmse,
                                        verbose=FALSE, array.layout=array.layout)
toc()
# Train neuralnet model
mx.set.seed(0)
tic("neuralnet training")
nnModel <- neuralnet(y~x1+x2+x3, data=df[-test.ind, ], hidden=c(nHidden),
                     linear.output=TRUE, stepmax=1e6)
toc()
# Train caret model
mx.set.seed(0)
tic("nnet training")
nnetModel <- nnet(y~x1+x2+x3, data=df[-test.ind, ], size=nHidden, trace=F,
                  linout=TRUE)
toc()
# Check response VS targets on training data:
par(mfrow=c(2,2))
plot(train.y, compute(nnModel, train.x)$net.result,
     main="neuralnet Train Fitting Fake Data", xlab="Target", ylab="Response")
abline(0,1, col="red")
plot(train.y, predict(nnetModel, train.x),
     main="nnet Train Fitting Fake Data", xlab="Target", ylab="Response")
abline(0,1, col="red")
plot(train.y, predict(mxModel1, train.x, array.layout=array.layout),
     main="MXNET (no hidden) Train Fitting Fake Data", xlab="Target",
     ylab="Response")
abline(0,1, col="red")
plot(train.y, predict(mxModel2, train.x, array.layout=array.layout),
     main="MXNET (with hidden) Train Fitting Fake Data", xlab="Target",
     ylab="Response")
abline(0,1, col="red")
I asked the same question in the mxnet GitHub (link), and uzhao was kind enough to suggest a different optimization approach, so credit goes to them.
Using the "rmsprop" optimizer, together with an increased batch size, enabled mxnet to deliver performance comparable to the neuralnet and nnet tools on this simple regression task. I have also included the performance of a linear lm regression for reference.
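Condensed, the key change relative to the original call is just the optimizer and batch-size arguments; here is a sketch of the delta (the full working code follows below):
# Changed settings only; everything else stays as in the question's code
batchSize <- 100        # increased from 20
nRound    <- 400        # reduced from 1000
optimizer <- "rmsprop"  # replaces plain SGD with learning.rate/momentum
mxModel <- mx.model.FeedForward.create(lro1, X=train.x, y=train.y,
                                       ctx=mx.cpu(), num.round=nRound,
                                       array.batch.size=batchSize,
                                       eval.metric=mx.metric.rmse,
                                       optimizer=optimizer,
                                       verbose=FALSE, array.layout="rowmajor")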
Results and the self-contained example code are included below. I hope this is of help to someone else (or to my future self).
Root-mean-square errors (on the training data) of the five models:
$mxModel1
[1] 0.1404579862
$mxModel2
[1] 0.03263213499
$nnet
[1] 0.03222651138
$neuralnet
[1] 0.03054112057
$linearModel
[1] 0.1404421006
Plots showing the good/reasonable performance of mxnet regression (and linear regression results in green):
And finally the code for this self-contained example:
## SIMPLE REGRESSION PROBLEM
# Check mxnet out-of-the-box performance VS neuralnet, and caret/nnet
library(mxnet)
library(neuralnet)
library(nnet)
library(caret)
library(tictoc)
library(reshape)
# Data definitions
nObservations <- 1000
noiseLvl <- 0.1
# Network config
nHidden <- 3
batchSize <- 100
nRound <- 400
verbose <- FALSE
array.layout = "rowmajor"
optimizer <- "rmsprop"
# GENERATE DATA:
set.seed(0)
df <- data.frame(x1=runif(nObservations),
                 x2=runif(nObservations),
                 x3=runif(nObservations))
df$y <- df$x1 + df$x2^2 + df$x3^3 + noiseLvl*runif(nObservations)
# normalize data columns
# df <- scale(df)
# Separate data into train/test
test.ind = seq(1, nObservations, 10) # 1 in 10 samples for testing
train.x = data.matrix(df[-test.ind, -which(colnames(df) %in% c("y"))])
train.y = df[-test.ind, "y"]
test.x = data.matrix(df[test.ind, -which(colnames(df) %in% c("y"))])
test.y = df[test.ind, "y"]
# Define mxnet network, following 5-minute regression example from here:
# http://mxnet-tqchen.readthedocs.io/en/latest//packages/r/fiveMinutesNeuralNetwork.html#regression
data <- mx.symbol.Variable("data")
label <- mx.symbol.Variable("label")
fc1 <- mx.symbol.FullyConnected(data, num_hidden=1, name="fc1")
lro1 <- mx.symbol.LinearRegressionOutput(data=fc1, label=label, name="lro")
# Train MXNET model
mx.set.seed(0)
tic("mxnet training 1")
mxModel1 <- mx.model.FeedForward.create(lro1, X=train.x, y=train.y,
                                        eval.data=list(data=test.x, label=test.y),
                                        ctx=mx.cpu(), num.round=nRound,
                                        array.batch.size=batchSize,
                                        eval.metric=mx.metric.rmse,
                                        verbose=verbose,
                                        array.layout=array.layout,
                                        optimizer=optimizer)
toc()
# Train network with a hidden layer
fc1 <- mx.symbol.FullyConnected(data, num_hidden=nHidden, name="fc1")
tanh1 <- mx.symbol.Activation(fc1, act_type="tanh", name="tanh1")
fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden=1, name="fc2")
lro2 <- mx.symbol.LinearRegressionOutput(data=fc2, label=label, name="lro2")
tic("mxnet training 2")
mx.set.seed(0)
mxModel2 <- mx.model.FeedForward.create(lro2, X=train.x, y=train.y,
                                        eval.data=list(data=test.x, label=test.y),
                                        ctx=mx.cpu(), num.round=nRound,
                                        array.batch.size=batchSize,
                                        eval.metric=mx.metric.rmse,
                                        verbose=verbose,
                                        array.layout=array.layout,
                                        optimizer=optimizer)
toc()
# Train neuralnet model
set.seed(0)
tic("neuralnet training")
nnModel <- neuralnet(y~x1+x2+x3, data=df[-test.ind, ], hidden=c(nHidden),
                     linear.output=TRUE, stepmax=1e6)
toc()
# Train caret model
set.seed(0)
tic("nnet training")
nnetModel <- nnet(y~x1+x2+x3, data=df[-test.ind, ], size=nHidden, trace=F,
                  linout=TRUE)
toc()
# Check response VS targets on training data:
par(mfrow=c(2,2))
plot(train.y, compute(nnModel, train.x)$net.result,
     main="neuralnet Train Fitting Fake Data", xlab="Target", ylab="Response")
abline(0,1, col="red")
# Plot linear model performance for reference
linearModel <- lm(y~., df[-test.ind, ])
points(train.y, predict(linearModel, data.frame(train.x)), col="green")
plot(train.y, predict(nnetModel, train.x),
     main="nnet Train Fitting Fake Data", xlab="Target", ylab="Response")
abline(0,1, col="red")
plot(train.y, predict(mxModel1, train.x, array.layout=array.layout),
     main="MXNET (no hidden) Train Fitting Fake Data", xlab="Target",
     ylab="Response")
abline(0,1, col="red")
plot(train.y, predict(mxModel2, train.x, array.layout=array.layout),
     main="MXNET (with hidden) Train Fitting Fake Data", xlab="Target",
     ylab="Response")
abline(0,1, col="red")
# Create and print table of results:
results <- list()
rmse <- function(target, response) {
  return(sqrt(mean((target - response)^2)))
}
results$mxModel1 <- rmse(train.y, predict(mxModel1, train.x,
                                          array.layout=array.layout))
results$mxModel2 <- rmse(train.y, predict(mxModel2, train.x,
                                          array.layout=array.layout))
results$nnet <- rmse(train.y, predict(nnetModel, train.x))
results$neuralnet <- rmse(train.y, compute(nnModel, train.x)$net.result)
results$linearModel <- rmse(train.y, predict(linearModel, data.frame(train.x)))
print(results)
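Note that the RMSE values above are computed on the training data; a held-out check follows the same pattern, reusing the objects and the rmse() helper defined above:
# Held-out test-set RMSE for the hidden-layer mxnet model
rmse(test.y, predict(mxModel2, test.x, array.layout=array.layout))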