I have been unable to get reasonable performance using the mxnet LinearRegressionOutput layer.
The self-contained example below attempts to perform regression of a simple polynomial function (y = x1 + x2^2 + x3^3) with a small amount of random noise thrown in.
The mxnet regression example given here is used, along with a slightly more complex network that includes a hidden layer. The example below also trains regression networks using the neuralnet and nnet packages, which, as can be seen from the plots, perform much better.
I realize the usual answer to a poorly performing network is hyper-parameter tuning; however, I have tried a range of values without any improvement in performance (a sketch of the kind of sweep I ran is shown below).
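For reference, here is a minimal sketch of the kind of learning-rate sweep I tried. It reuses train.x, train.y, and the lro1 symbol defined in the reproducible example below, and the grid values are illustrative only:
# Hypothetical learning-rate sweep; reuses objects from the example below
library(mxnet)
lrGrid <- c(2e-7, 2e-6, 2e-5, 2e-4)
sweepRmse <- sapply(lrGrid, function(lr) {
  mx.set.seed(0)
  m <- mx.model.FeedForward.create(lro1, X=train.x, y=train.y,
                                   ctx=mx.cpu(), num.round=1000,
                                   array.batch.size=20, learning.rate=lr,
                                   momentum=0.9, eval.metric=mx.metric.rmse,
                                   verbose=FALSE, array.layout="rowmajor")
  # training-set RMSE for this learning rate
  sqrt(mean((train.y - predict(m, train.x, array.layout="rowmajor"))^2))
})
names(sweepRmse) <- lrGrid
print(sweepRmse)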
My set-up is as follows:
MXNet version: 0.7
R `sessionInfo()`: R version 3.3.2 (2016-10-31)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1
Poor regression results of mxnet, produced by this reproducible example:
## SIMPLE REGRESSION PROBLEM
# Check mxnet out-of-the-box performance VS neuralnet, and caret/nnet
library(mxnet)
library(neuralnet)
library(nnet)
library(caret)
library(tictoc)
library(reshape)
# Data definitions
nObservations <- 1000
noiseLvl <- 0.1
# Network config
nHidden <- 3
learnRate <- 2e-6
momentum <- 0.9
batchSize <- 20
nRound <- 1000
verbose <- FALSE
array.layout = "rowmajor"
# GENERATE DATA:
df <- data.frame(x1=runif(nObservations),
                 x2=runif(nObservations),
                 x3=runif(nObservations))
df$y <- df$x1 + df$x2^2 + df$x3^3 + noiseLvl*runif(nObservations)
# normalize data columns
# df <- scale(df)
# Separate data into train/test
test.ind = seq(1, nObservations, 10) # 1 in 10 samples for testing
train.x = data.matrix(df[-test.ind, -which(colnames(df) %in% c("y"))])
train.y = df[-test.ind, "y"]
test.x = data.matrix(df[test.ind, -which(colnames(df) %in% c("y"))])
test.y = df[test.ind, "y"]
# Define mxnet network, following 5-minute regression example from here:
# http://mxnet-tqchen.readthedocs.io/en/latest//packages/r/fiveMinutesNeuralNetwork.html#regression
data <- mx.symbol.Variable("data")
label <- mx.symbol.Variable("label")
fc1 <- mx.symbol.FullyConnected(data, num_hidden=1, name="fc1")
lro1 <- mx.symbol.LinearRegressionOutput(data=fc1, label=label, name="lro")
# Train MXNET model
mx.set.seed(0)
tic("mxnet training 1")
mxModel1 <- mx.model.FeedForward.create(lro1, X=train.x, y=train.y,
                                        eval.data=list(data=test.x, label=test.y),
                                        ctx=mx.cpu(), num.round=nRound,
                                        array.batch.size=batchSize,
                                        learning.rate=learnRate, momentum=momentum,
                                        eval.metric=mx.metric.rmse,
                                        verbose=FALSE, array.layout=array.layout)
toc()
# Train network with a hidden layer
fc1 <- mx.symbol.FullyConnected(data, num_hidden=nHidden, name="fc1")
tanh1 <- mx.symbol.Activation(fc1, act_type="tanh", name="tanh1")
fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden=1, name="fc2")
lro2 <- mx.symbol.LinearRegressionOutput(data=fc2, label=label, name="lro")
tic("mxnet training 2")
mxModel2 <- mx.model.FeedForward.create(lro2, X=train.x, y=train.y,
                                        eval.data=list(data=test.x, label=test.y),
                                        ctx=mx.cpu(), num.round=nRound,
                                        array.batch.size=batchSize,
                                        learning.rate=learnRate, momentum=momentum,
                                        eval.metric=mx.metric.rmse,
                                        verbose=FALSE, array.layout=array.layout)
toc()
# Train neuralnet model
mx.set.seed(0)
tic("neuralnet training")
nnModel <- neuralnet(y~x1+x2+x3, data=df[-test.ind, ], hidden=c(nHidden),
                     linear.output=TRUE, stepmax=1e6)
toc()
# Train caret model
mx.set.seed(0)
tic("nnet training")
nnetModel <- nnet(y~x1+x2+x3, data=df[-test.ind, ], size=nHidden, trace=F,
                  linout=TRUE)
toc()
# Check response VS targets on training data:
par(mfrow=c(2,2))
plot(train.y, compute(nnModel, train.x)$net.result,
     main="neuralnet Train Fitting Fake Data", xlab="Target", ylab="Response")
abline(0,1, col="red")
plot(train.y, predict(nnetModel, train.x),
     main="nnet Train Fitting Fake Data", xlab="Target", ylab="Response")
abline(0,1, col="red")
plot(train.y, predict(mxModel1, train.x, array.layout=array.layout),
     main="MXNET (no hidden) Train Fitting Fake Data", xlab="Target",
     ylab="Response")
abline(0,1, col="red")
plot(train.y, predict(mxModel2, train.x, array.layout=array.layout),
     main="MXNET (with hidden) Train Fitting Fake Data", xlab="Target",
     ylab="Response")
abline(0,1, col="red")
I asked the same question in the mxnet GitHub (link), and uzhao was kind enough to suggest a different optimization approach, so credit goes to them.
Using the "rmsprop" optimizer, together with an increased batch size, enabled mxnet to deliver performance comparable to the neuralnet and nnet tools on this simple regression task. I have also included the performance of a linear lm regression for reference.
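Condensed, the key change relative to the original call is just the optimizer and batch-size arguments; here is a sketch of the delta (the full working code follows below):
# Changed settings only; everything else stays as in the question's code
batchSize <- 100        # increased from 20
nRound    <- 400        # reduced from 1000
optimizer <- "rmsprop"  # replaces plain SGD with learning.rate/momentum
mxModel <- mx.model.FeedForward.create(lro1, X=train.x, y=train.y,
                                       ctx=mx.cpu(), num.round=nRound,
                                       array.batch.size=batchSize,
                                       eval.metric=mx.metric.rmse,
                                       optimizer=optimizer,
                                       verbose=FALSE, array.layout="rowmajor")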
Results and the self-contained example code are included below. I hope this is of help to someone else (or to my future self).
Root-mean-square errors (on the training data) of the five models:
$mxModel1
[1] 0.1404579862
$mxModel2
[1] 0.03263213499
$nnet
[1] 0.03222651138
$neuralnet
[1] 0.03054112057
$linearModel
[1] 0.1404421006
Plots showing the good/reasonable performance of mxnet regression (and linear regression results in green):
And finally the code for this self-contained example:
## SIMPLE REGRESSION PROBLEM
# Check mxnet out-of-the-box performance VS neuralnet, and caret/nnet
library(mxnet)
library(neuralnet)
library(nnet)
library(caret)
library(tictoc)
library(reshape)
# Data definitions
nObservations <- 1000
noiseLvl <- 0.1
# Network config
nHidden <- 3
batchSize <- 100
nRound <- 400
verbose <- FALSE
array.layout = "rowmajor"
optimizer <- "rmsprop"
# GENERATE DATA:
set.seed(0)
df <- data.frame(x1=runif(nObservations),
                 x2=runif(nObservations),
                 x3=runif(nObservations))
df$y <- df$x1 + df$x2^2 + df$x3^3 + noiseLvl*runif(nObservations)
# normalize data columns
# df <- scale(df)
# Separate data into train/test
test.ind = seq(1, nObservations, 10) # 1 in 10 samples for testing
train.x = data.matrix(df[-test.ind, -which(colnames(df) %in% c("y"))])
train.y = df[-test.ind, "y"]
test.x = data.matrix(df[test.ind, -which(colnames(df) %in% c("y"))])
test.y = df[test.ind, "y"]
# Define mxnet network, following 5-minute regression example from here:
# http://mxnet-tqchen.readthedocs.io/en/latest//packages/r/fiveMinutesNeuralNetwork.html#regression
data <- mx.symbol.Variable("data")
label <- mx.symbol.Variable("label")
fc1 <- mx.symbol.FullyConnected(data, num_hidden=1, name="fc1")
lro1 <- mx.symbol.LinearRegressionOutput(data=fc1, label=label, name="lro")
# Train MXNET model
mx.set.seed(0)
tic("mxnet training 1")
mxModel1 <- mx.model.FeedForward.create(lro1, X=train.x, y=train.y,
                                        eval.data=list(data=test.x, label=test.y),
                                        ctx=mx.cpu(), num.round=nRound,
                                        array.batch.size=batchSize,
                                        eval.metric=mx.metric.rmse,
                                        verbose=verbose,
                                        array.layout=array.layout,
                                        optimizer=optimizer)
toc()
# Train network with a hidden layer
fc1 <- mx.symbol.FullyConnected(data, num_hidden=nHidden, name="fc1")
tanh1 <- mx.symbol.Activation(fc1, act_type="tanh", name="tanh1")
fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden=1, name="fc2")
lro2 <- mx.symbol.LinearRegressionOutput(data=fc2, label=label, name="lro2")
tic("mxnet training 2")
mx.set.seed(0)
mxModel2 <- mx.model.FeedForward.create(lro2, X=train.x, y=train.y,
                                        eval.data=list(data=test.x, label=test.y),
                                        ctx=mx.cpu(), num.round=nRound,
                                        array.batch.size=batchSize,
                                        eval.metric=mx.metric.rmse,
                                        verbose=verbose,
                                        array.layout=array.layout,
                                        optimizer=optimizer)
toc()
# Train neuralnet model
set.seed(0)
tic("neuralnet training")
nnModel <- neuralnet(y~x1+x2+x3, data=df[-test.ind, ], hidden=c(nHidden),
                     linear.output=TRUE, stepmax=1e6)
toc()
# Train caret model
set.seed(0)
tic("nnet training")
nnetModel <- nnet(y~x1+x2+x3, data=df[-test.ind, ], size=nHidden, trace=F,
                  linout=TRUE)
toc()
# Check response VS targets on training data:
par(mfrow=c(2,2))
plot(train.y, compute(nnModel, train.x)$net.result,
     main="neuralnet Train Fitting Fake Data", xlab="Target", ylab="Response")
abline(0,1, col="red")
# Plot linear model performance for reference
linearModel <- lm(y~., df[-test.ind, ])
points(train.y, predict(linearModel, data.frame(train.x)), col="green")
plot(train.y, predict(nnetModel, train.x),
     main="nnet Train Fitting Fake Data", xlab="Target", ylab="Response")
abline(0,1, col="red")
plot(train.y, predict(mxModel1, train.x, array.layout=array.layout),
     main="MXNET (no hidden) Train Fitting Fake Data", xlab="Target",
     ylab="Response")
abline(0,1, col="red")
plot(train.y, predict(mxModel2, train.x, array.layout=array.layout),
     main="MXNET (with hidden) Train Fitting Fake Data", xlab="Target",
     ylab="Response")
abline(0,1, col="red")
# Create and print table of results:
results <- list()
rmse <- function(target, response) {
  return(sqrt(mean((target - response)^2)))
}
results$mxModel1 <- rmse(train.y, predict(mxModel1, train.x,
                                          array.layout=array.layout))
results$mxModel2 <- rmse(train.y, predict(mxModel2, train.x,
                                          array.layout=array.layout))
results$nnet <- rmse(train.y, predict(nnetModel, train.x))
results$neuralnet <- rmse(train.y, compute(nnModel, train.x)$net.result)
results$linearModel <- rmse(train.y, predict(linearModel, data.frame(train.x)))
print(results)
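Note that the RMSE values above are computed on the training data; a held-out check follows the same pattern, reusing the objects and the rmse() helper defined above:
# Held-out test-set RMSE for the hidden-layer mxnet model
rmse(test.y, predict(mxModel2, test.x, array.layout=array.layout))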