Search code examples
Tags: r, machine-learning, neural-network, cross-validation, forecasting

How to solve "Error in FUN(X[[i]], ...) : only defined on a data frame with all numeric variables" in cross validation


When I am doing neural nets, I want to use cross-validation to determine the number of neurons used in a layer. Here is the web page I referred: https://www.r-bloggers.com/selecting-the-number-of-neurons-in-the-hidden-layer-of-a-neural-network/

My code is as below:

# Min-max scale the training set to [0, 1], dropping the first two
# (non-predictor, e.g. date/id) columns.
max.train = apply(train[,-c(1,2)], 2 , max)
min.train = apply(train[,-c(1,2)], 2 , min)
trainNN = as.data.frame(scale(train[,-c(1,2)], center = min.train, scale = max.train - min.train))
# NOTE(review): the test set is scaled with its OWN min/max below, not the
# training set's. The inverse-scaling later uses a single lprice range, so
# test predictions and targets end up on inconsistent scales — the test set
# should normally be scaled with min.train / max.train. TODO confirm intent.
max.test = apply(test[,-c(1,2)], 2 , max)
min.test = apply(test[,-c(1,2)], 2 , min)
testNN = as.data.frame(scale(test[,-c(1,2)], center = min.test, scale = max.test - min.test))
# Scale the full corn dataset the same way (used by the tuning loop below).
# NOTE(review): if any retained column of `corn` is non-numeric (e.g. a Date
# or factor), max()/min() over it raise "only defined on a data frame with
# all numeric variables" — likely the error being asked about; verify with
# str(corn).
maxs <- apply(corn[,-c(1,2)], 2, max) 
mins <- apply(corn[,-c(1,2)], 2, min)
scaled = as.data.frame(scale(corn[,-c(1,2)], center = mins, scale = maxs - mins))

# Asker's cross-validation routine: returns the mean held-out MSE of a
# neuralnet model with hidden layout `hidden_l`, inverse-scaled via the
# lprice range of `data`.
#
# NOTE(review): this is not actually cross-validation — the loop never
# resamples; it refits on the same global `trainNN` and scores the same
# global `testNN` k times, so iterations differ only by the random weight
# initialisation. The `data` argument is used solely for the lprice range.
crossvalidate <- function(data,hidden_l=c(5))
{
  cv.error <- NULL   # grown one element per iteration
  k <- 10            # number of (identical) refits

  for(j in 1:k)
  {
    # Fit on the global training frame (not on `data`).
    nn <- neuralnet(lprice ~ volume+open_interest, data=trainNN, hidden=hidden_l, linear.output=T)
    # Predict on the two predictor columns of the global test frame.
    pr.nn <- compute(nn, testNN[,1:2])
    # Map predictions and observed responses back to lprice units using
    # the range taken from `data` (the unscaled corn data in the caller).
    pr.nn <- pr.nn$net.result*(max(data$lprice)-min(data$lprice))+min(data$lprice)
    test.cv.r <- (testNN$lprice)*(max(data$lprice)-min(data$lprice))+min(data$lprice)
    # MSE on the test frame for this repetition.
    cv.error[j] <- sum((test.cv.r - pr.nn)^2)/nrow(testNN)
  }
  return(mean(cv.error))
}

# Sweep the hidden-layer size from 1 to 5 neurons, recording training MSE
# and the cross-validated test MSE for each size.
test.error <- NULL
train.error <- NULL
pbar <- create_progress_bar('text')   # plyr text progress bar
pbar$init(5)

set.seed(100)
for(i in 1:5)
{
  # Fit the net and calculate training error (point estimate)
  nn <- neuralnet(lprice ~ volume + open_interest, data=scaled, hidden=c(i), linear.output=T)
  # NOTE(review): 7.880 and 7.129 are hard-coded — presumably max/min of the
  # unscaled lprice column; confirm against the original corn data.
  train.error[i] <- sum(((as.data.frame(nn$net.result)*(7.880 - 7.129) + 7.129) - (scaled$lprice*(7.880 - 7.129) + 7.129))^2)/nrow(scaled)

  # Calculate test error through cross validation
  test.error[i] <- crossvalidate(corn, hidden_l=c(i))

  # Step bar
  pbar$step()
}

The difference between the blog author's code and mine is that I only have two predictors: volume and open interest. I need to predict the corn price in 2018, so I took the 2018 data as the test set and the pre-2018 data as the training set, dividing the full dataset before the loop.

The data looks like this (the testNN and scaled datasets are similar). There are no NAs in the three datasets or in the original corn dataset, and all three variables are numeric.

 - head(trainNN)
 - volume   open_interest lprice
 - 1 0.007069       0.03093   0.4043
 - 2 0.011904       0.03133   0.4921 
 - 3 0.011351       0.03193   0.4691

summary(trainNN)
     volume       open_interest        lprice     
 Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
 1st Qu.:0.0000   1st Qu.:0.0003   1st Qu.:0.346  
 Median :0.0003   Median :0.0057   Median :0.516  
 Mean   :0.0144   Mean   :0.0462   Mean   :0.550  
 3rd Qu.:0.0035   3rd Qu.:0.0423   3rd Qu.:0.829  
 Max.   :1.0000   Max.   :1.0000   Max.   :1.000   

But the loop does not work and keeps producing the error

Error in FUN(X[[i]], ...) : only defined on a data frame with all numeric variables

In addition: Warning message: Algorithm did not converge in 1 of 1 repetition(s) within the stepmax.

Why does the problem appear and how to solve it?

Here are the details of my datasets:

dput(head(trainNN, 50))
structure(list(volume = c(0.00120479469333848, 0.000129847003131168, 
0.000733635567691098, 7.41982875035244e-06, 0.000944173208482348, 
0.00105083324676866, 0.0157161247718403, 3.70991437517622e-06, 
4.17365367207325e-05, 4.63739296897028e-06, 1.57671360944989e-05, 
0.000153961446569813, 8.81104664104353e-05, 4.17365367207325e-05, 
0.00331944588718892, 0.0045400077166219, 0.00387500556487156, 
0.000345949515485183, 0.000164163711101548, 0.00201819342009586, 
0.00236043302120587, 0.01444733405553, 0.00250604716043154, 2.31869648448514e-05, 
4.54464510959087e-05, 0, 9.27478593794055e-07, 0.00279356552450769, 
0.00255242109012124, 0.00196347218306201, 1.02022645317346e-05, 
0.00217957469541603, 0.00106845534005075, 4.08090581269384e-05, 
0.000232797127042308, 7.79082018787006e-05, 0.00519944499680947, 
0.00792437710537641, 0.00630963687358096, 0.000309777850327214, 
4.35914939083206e-05, 0.00141440485553593, 0.00513637645243148, 
0.125604716043154, 1.85495718758811e-05, 2.87518364076157e-05, 
8.3473073441465e-06, 0.000630685443779958, 0.0135903438348643, 
0.000255056613293365), open_interest = c(0.0197915558864192, 
0.00123364883547804, 0.0172950122301393, 2.65872593853026e-06, 
0.0139383707327449, 0.0200973093693502, 0.119931138998192, 0.00937466765925768, 
0.00381261299585239, 0.0039495373816867, 0.00020738062320536, 
0.000312400297777305, 0.0220022865043071, 0.000390832712963948, 
0.0272785281293204, 0.101016962671488, 0.0195509411889822, 0.00858901414442199, 
0.00559794746357545, 0.0130822609805381, 0.0132510900776348, 
0.215476443688185, 0.000724502818249495, 0.000316388386685101, 
0.000623471232585345, 5.58332447091354e-05, 5.45038817398703e-05, 
0.0573593533978517, 0.0156466021482506, 0.0447742741678188, 0.000305753482930979, 
0.00769036477719877, 0.0141523981707966, 0.000405455705625864, 
0.016496065085611, 0, 0.0187905455705626, 0.0804849516111879, 
0.00265739657556099, 0.012168988620653, 0.000519780920982665, 
0.0012389662873551, 0.0188370732744869, 0.291452196107625, 0.000623471232585345, 
0.00173348931192173, 0.000214027438051686, 0.0177616186323514, 
0.133994469850048, 0.00653116026799957), lprice = c(0.510582301463774, 
0.344204416537943, 0.851462133609609, 0.340903299172643, 0.895773917944989, 
0.356511250391288, 0.847513792278632, 0.31672235023017, 0.652594661043185, 
0.412485880130917, 0.806208151506684, 0.688082354441166, 0.346674896705426, 
0.868252274097258, 0.933373193480856, 0.859883659081866, 0.0987118318873677, 
0.648009457977949, 0.187832453779507, 0.383192570887849, 0.332614534559063, 
0.885931507335992, 0.688718913676761, 0.520006015479584, 0.372745457136178, 
0.552826105439045, 0.588978642534753, 0.350782208548046, 0.77436599354315, 
0.854837135858998, 0.74296653001429, 0.367085963811137, 0.909863554831568, 
0.294663244451221, 0.333445735943372, 0.420293640910594, 0.0957352939445262, 
0.367085963811137, 0.674643852907022, 0.301489653627575, 0.681060051128024, 
0.667550153291217, 0.946548376814812, 0.478927864832375, 0.373551996776401, 
0.845818056626141, 0.804458971773855, 0.845252331336205, 0.401477551563487, 
0.250302625671116)), row.names = c(NA, 50L), class = "data.frame")

dput(head(testNN, 50))
structure(list(volume = c(0.00706941956640145, 0.0119040265931462, 
0.0113509385651156, 0.0120303664980263, 0.0186702303878354, 0.0126424131483343, 
0.0311834960778478, 0.0338506718475386, 0.0241561898130731, 0.0330027907081211, 
0.026809327815555, 0.0260035599777642, 0.0319892639156386, 0.0265089195972845, 
0.0357401553138564, 0.0365683835791814, 0.035324637404473, 0.049780730076197, 
0.0364532738880685, 0.0490311133072418, 0.0276179032067875, 0.0210566508133482, 
0.00746247704825061, 0.00181648707683151, 0.00219831434491356, 
0.000401480142174506, 0.00295073866731053, 0.000188106080599244, 
0, 0.000126339904880089, 0.0021084726347766, 0.000345329073338911, 
0.000188106080599244, 0.0019287892145027, 0.0135520604634709, 
0.00230500137570119, 0.00889432930355829, 0.553716920001572, 
0.839989106692646, 0.951370366834933, 0.581725073136767, 0.638128821782123, 
0.751267610378964, 0.441645001712608, 0.442015598766923, 0.320864052647242, 
0.251806660639785, 0.470905323682836, 0.315720614741902, 0.344082519610761
), open_interest = c(0.0309283824075963, 0.0313335699255852, 
0.0319315708142034, 0.0374477098522044, 0.0445175334419422, 0.0438049622896169, 
0.0440704299738165, 0.0444728230951296, 0.0423099600680715, 0.0331080118147091, 
0.0335383489027801, 0.0305092229062284, 0.0273655266459695, 0.0261220201252449, 
0.0255910847568456, 0.0254457761297047, 0.0257056550205528, 0.0256888886404981, 
0.0256721222604434, 0.0255715239801151, 0.0259739171014282, 0.0258817020111273, 
0.025663739070416, 0.0238865027846163, 0.0170486141189686, 0.0170402309289413, 
0.0153188825766573, 0.00794446974925879, 0.000502991401641429, 
0, 0, 0, 0, 0, 0, 0, 0, 0.930472616309776, 0.877066107042159, 
0.811900776562836, 0.775534498224161, 0.769792013055421, 0.714309267057696, 
0.628929271025739, 0.514769783629865, 0.481555584741476, 0.455000433131485, 
0.436155021949986, 0.415213813261648, 0.364766570073688), lprice = c(0.404330074913638, 
0.492083022366336, 0.469079773268984, 0.619738346603267, 0.60162141297546, 
0.610684813595797, 0.60615434808141, 0.510439915573559, 0.542467292012032, 
0.597086005584079, 0.588007763147557, 0.556155639540664, 0.528756375062575, 
0.469079773268984, 0.487487460356058, 0.413610956731874, 0.418247516230837, 
0.455247217965267, 0.446012714799772, 0.422881491968453, 0.413610956731874, 
0.408971810588703, 0.446012714799772, 0.446012714799772, 0.455247217965267, 
0.455247217965267, 0.487487460356058, 0.459860626819221, 0.469079773268984, 
0.459860626819221, 0.455247217965267, 0.44139161479424, 0.469079773268984, 
0.473685516530432, 0.578919599132632, 0.588007763147557, 0.727618812006047, 
0.574371789751591, 0.628782033460881, 0.758822110329113, 0.794340239055759, 
0.705258838205934, 0.646839954229117, 0.592548123209006, 0.537899497749437, 
0.551595360157961, 0.528756375062575, 0.473685516530432, 0.551595360157961, 
0.510439915573559)), row.names = c(NA, 50L), class = "data.frame")

dput(head(scaled, 50))
structure(list(volume = c(0.000115934824224257, 0.00039788831673765, 
2.13320076572633e-05, 8.3473073441465e-06, 1.57671360944989e-05, 
8.3473073441465e-06, 0.000161381275320166, 3.70991437517622e-06, 
9.83127309421699e-05, 2.78243578138217e-05, 7.41982875035244e-06, 
0.000156743882351195, 1.39121789069108e-05, 0.000120572217193227, 
0.000115934824224257, 0.000556487156276433, 5.19388012524671e-05, 
0.000217957469541603, 0.000238361998605072, 0.000139121789069108, 
0.000301430542983068, 4.82288868772909e-05, 4.63739296897028e-05, 
2.22594862510573e-05, 6.86334159407601e-05, 0.000283808449700981, 
0.000370991437517622, 7.14158517221423e-05, 0.000189205633133987, 
0.000209610162197456, 0.000160453796726372, 8.81104664104353e-05, 
1.85495718758811e-05, 9.27478593794055e-06, 0.000182713282977429, 
0.00231962396307893, 0.00387500556487156, 0.00414119192129046, 
0.00555003190526363, 0.00519944499680947, 0.00395105880956268, 
0.00168059121195483, 0.000386758573612121, 0.00250511968183774, 
0.00169543086945553, 0.00119273747161916, 0.00142275216288008, 
0.00146541617819461, 0.000398815795331444, 0.000248564263136807
), open_interest = c(0.002306444751675, 0.0023024566627672, 0.00229713921089014, 
0.00229182175901308, 0.00228916303307455, 0.00228916303307455, 
0.00222934169945762, 0.00222934169945762, 0.00214293310645539, 
0.00211767521003935, 0.00210704030628523, 0.0020738062320536, 
0.00206848878017654, 0.00195416356481974, 0.0019023184090184, 
0.00178001701584601, 0.00174678294161438, 0.00166037434861215, 
0.00171487823035202, 0.00169094969690524, 0.00157396575560991, 
0.00157396575560991, 0.00156731894076359, 0.00157130702967138, 
0.00154870785919387, 0.00155003722216314, 0.0011778155907689, 
0.00114458151653728, 0.0010063277677337, 0.00113793470169095, 
0.00118845049452302, 0.00114192279059875, 0.00112597043496756, 
0.00112198234605977, 0.000962458789747953, 0.0208909390620015, 
0.0195509411889822, 0.019484473040519, 0.019354195469531, 0.0187905455705626, 
0.0171142188663193, 0.0163338828033606, 0.0161557481654791, 0.0155362650218016, 
0.0151215037753908, 0.0143584494310326, 0.0133880144634691, 0.0119376794640009, 
0.0117622035520579, 0.0116146442624694), lprice = c(0.0159141698893206, 
0.0085112121866148, 0.0116889477100056, 0.01063054465461, 0.00745028009139348, 
0.00745028009139348, 0.00957129992671682, 0.00638850229493165, 
0.00957129992671682, 0.00213290706124252, 0, 0.00638850229493165, 
0.00106688045361737, 0.0085112121866148, 0.00213290706124252, 
0.0243246172486233, 0.0232762109173386, 0.0337233246690091, 0.0807520068767474, 
0.067628746289212, 0.0358029506576186, 0.043056178402089, 0.0625467096965389, 
0.0502701234826566, 0.067628746289212, 0.0615279714632226, 0.0645818516892853, 
0.0574452091092523, 0.067628746289212, 0.0482129799461532, 0.0767278341025268, 
0.0767278341025268, 0.0543749038501801, 0.0553991254885964, 0.0666138889638279, 
0.0997025349268141, 0.0987118318873677, 0.105631328877444, 0.100692501657728, 
0.0957352939445262, 0.0817561533058562, 0.0827595433157385, 0.0747111794021993, 
0.0797471028870763, 0.0696561442074541, 0.0737017059358222, 0.0716804609726562, 
0.0615279714632226, 0.0605084534837667, 0.0594881545636145)), row.names = c(NA, 
50L), class = "data.frame")

Solution

  • You can use the following code

    library(neuralnet)
    library(plyr)

    # Work directly on the already-scaled training data. (The original
    # answer wrote `scaled = dput(head(trainNN, 50))`; dput() merely echoes
    # its argument to the console and returns it unchanged, so the plain
    # assignment below is equivalent without the noise.)
    scaled <- head(trainNN, 50)

    # Build "lprice ~ volume + open_interest" from the column names so the
    # formula stays correct if predictors are added or renamed.
    n <- names(scaled)
    f <- as.formula(paste("lprice ~", paste(n[!n %in% "lprice"], collapse = " + ")))

    set.seed(450)
    k <- 10                    # number of random 90/10 train/test splits
    cv.error <- numeric(k)     # preallocate instead of growing from NULL

    pbar <- create_progress_bar('text')
    pbar$init(k)

    for (i in seq_len(k)) {
      # Random 90/10 split of the scaled data.
      index <- sample(seq_len(nrow(scaled)), round(0.9 * nrow(scaled)))
      train.cv <- scaled[index, ]
      test.cv <- scaled[-index, ]

      # Fit a single-hidden-layer (5 neuron) network on the training fold.
      nn <- neuralnet(f, data = train.cv, hidden = c(5), linear.output = TRUE)

      # Predict the held-out fold, then map predictions and observed
      # responses back to lprice units using the same range.
      pr.nn <- predict(nn, test.cv)
      pr.nn <- pr.nn * (max(scaled$lprice) - min(scaled$lprice)) + min(scaled$lprice)
      test.cv.r <- test.cv$lprice * (max(scaled$lprice) - min(scaled$lprice)) + min(scaled$lprice)

      # MSE on this fold.
      cv.error[i] <- sum((test.cv.r - pr.nn)^2) / nrow(test.cv)

      pbar$step()
    }

    mean(cv.error)
    cv.error
    plot(nn)
    

    If you want to correct your code then see following

    library(plyr)
    library(neuralnet)

    # Use the scaled training data directly; the original's
    # `scaled = dput(head(trainNN, 50))` only echoed the data to the
    # console — dput() returns its argument unchanged.
    scaled <- head(trainNN, 50)

    # Formula built from the column names: "lprice ~ volume + open_interest".
    n <- names(scaled)
    f <- as.formula(paste("lprice ~", paste(n[!n %in% "lprice"], collapse = " + ")))
    
    # Random-split cross-validation of a neuralnet model.
    #
    # data     : scaled data frame containing lprice plus the predictors
    # hidden_l : hidden-layer sizes, passed straight to neuralnet()
    #
    # Returns the mean held-out MSE over k random 90/10 splits.
    #
    # Fix vs. the original answer: the body now uses the `data` argument
    # instead of silently reading the global `scaled`, pre-allocates
    # cv.error, and hoists the lprice range out of the loop. It still uses
    # the formula `f` defined in the surrounding script, as before.
    crossvalidate <- function(data, hidden_l = c(5))
    {
      # Number of train-test splits.
      k <- 10
      cv.error <- numeric(k)

      # Range of the (scaled) response, used to map values back to
      # lprice units.
      lo <- min(data$lprice)
      hi <- max(data$lprice)

      for (j in seq_len(k))
      {
        # Random 90/10 train/test split.
        index <- sample(seq_len(nrow(data)), round(0.90 * nrow(data)))
        train.cv <- data[index, ]
        test.cv <- data[-index, ]

        # Fit the network on the training fold.
        nn <- neuralnet(f, data = train.cv, hidden = hidden_l, linear.output = TRUE)

        # Predict the held-out fold and un-scale predictions and truth.
        pr.nn <- compute(nn, test.cv)
        pr.nn <- pr.nn$net.result * (hi - lo) + lo
        test.cv.r <- test.cv$lprice * (hi - lo) + lo

        # MSE for this split.
        cv.error[j] <- sum((test.cv.r - pr.nn)^2) / nrow(test.cv)
      }

      # Average MSE over the k splits.
      mean(cv.error)
    }
    
    n <- names(scaled)
    f <- as.formula(paste("lprice ~", paste(n[!n %in% "lprice"], collapse = " + ")))

    # Original (unscaled) range of lprice, used to convert scaled values
    # back to price units. NOTE(review): 7.880 / 7.129 are hard-coded from
    # the asker's data — compute them as max/min of the unscaled lprice
    # column when that data is available.
    lprice.max <- 7.880
    lprice.min <- 7.129

    # Candidate hidden-layer sizes 1..13.
    n.hidden <- 13
    # Bug fix: these vectors must exist before the indexed assignments
    # below — the original snippet assigned train.error[i] without ever
    # creating train.error, which errors when run standalone.
    train.error <- numeric(n.hidden)
    test.error <- numeric(n.hidden)

    # Generate progress bar
    pbar <- create_progress_bar('text')
    pbar$init(n.hidden)

    set.seed(100)
    # Testing and cross-validating (may take a while to compute)
    for (i in seq_len(n.hidden))
    {
      # Fit the net and calculate training error (point estimate).
      nn <- neuralnet(f, data = scaled, hidden = c(i), linear.output = TRUE)
      fitted <- as.data.frame(nn$net.result) * (lprice.max - lprice.min) + lprice.min
      actual <- scaled$lprice * (lprice.max - lprice.min) + lprice.min
      train.error[i] <- sum((fitted - actual)^2) / nrow(scaled)

      # Calculate test error through cross-validation.
      test.error[i] <- crossvalidate(scaled, hidden_l = c(i))

      # Step bar
      pbar$step()
    }
    test.error
    train.error

    # Plot train error
    plot(train.error, main = 'MSE vs hidden neurons', xlab = "Hidden neurons",
         ylab = 'Train error MSE', type = 'l', col = 'red', lwd = 2)
    # Plot test error
    plot(test.error, main = 'MSE vs hidden neurons', xlab = "Hidden neurons",
         ylab = 'Test error MSE', type = 'l', col = 'blue', lwd = 2)