Search code examples
r · machine-learning · artificial-intelligence · random-forest · training-data

R: Check Variables in Training Data


I'm using a model trained on data that was provided to me in an RData file, and a data frame I built myself containing all the columns I believed were in the training data.

# Load a saved model and use it to predict on a new data file.
# Only the trailing (user-supplied) command-line arguments are read.
args <- commandArgs(trailingOnly = TRUE)

# args[1] is the path to the RDS file; the first element of the loaded
# object is used as the fitted model.
model <- readRDS(args[1])
m <- model[[1]]

# fread() and setDF() are data.table functions, so data.table must be attached.
# NOTE(review): newDataPath is not defined in this snippet -- presumably it is
# set elsewhere or was meant to come from the command line; confirm.
infile <- fread(newDataPath, header = TRUE)
setDF(infile)

# Remove the chr/pos/end columns before predicting. drop = FALSE keeps the
# result a data frame even when only a single column is left.
i <- infile[, !colnames(infile) %in% c("chr", "pos", "end"), drop = FALSE]

predictions <- predict(m, i)

Running this, though, I get the error "variables in the training data missing in newdata".

With colnames(i), I can find a list of the variables in newdata, but how can I do the same for the training data—which is, I think, an object of class randomForest?


Solution

  • You can use str to look at the structure of the model to find where the column names are.

    I'm assuming you are using the randomForest package, but it will be the same idea for other models.

    # Fit a small example forest on the built-in iris data so there is a
    # model object to inspect.
    library("randomForest")

    model <- randomForest(
      Species ~ .,
      data = iris,
      ntree = 5
    )

    # str() lists every component stored in the fitted object.
    str(model)
    #> List of 19
    #>  $ call           : language randomForest(formula = Species ~ ., data = iris, ntree = 5)
    #>  $ type           : chr "classification"
    #>  $ predicted      : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
    #>   ..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
    #>  $ err.rate       : num [1:5, 1:4] 0.0862 0.0753 0.114 0.0714 0.0833 ...
    #>   ..- attr(*, "dimnames")=List of 2
    #>   .. ..$ : NULL
    #>   .. ..$ : chr [1:4] "OOB" "setosa" "versicolor" "virginica"
    #>  $ confusion      : num [1:3, 1:4] 45 0 0 0 41 8 0 3 35 0 ...
    #>   ..- attr(*, "dimnames")=List of 2
    #>   .. ..$ : chr [1:3] "setosa" "versicolor" "virginica"
    #>   .. ..$ : chr [1:4] "setosa" "versicolor" "virginica" "class.error"
    #>  $ votes          : matrix [1:150, 1:3] 1 1 1 1 1 1 1 1 1 1 ...
    #>   ..- attr(*, "dimnames")=List of 2
    #>   .. ..$ : chr [1:150] "1" "2" "3" "4" ...
    #>   .. ..$ : chr [1:3] "setosa" "versicolor" "virginica"
    #>  $ oob.times      : num [1:150] 1 2 1 1 3 1 2 2 2 2 ...
    #>  $ classes        : chr [1:3] "setosa" "versicolor" "virginica"
    #>  $ importance     : num [1:4, 1] 20.53 4.33 19.17 55.25
    #>   ..- attr(*, "dimnames")=List of 2
    #>   .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
    #>   .. ..$ : chr "MeanDecreaseGini"
    #>  $ importanceSD   : NULL
    #>  $ localImportance: NULL
    #>  $ proximity      : NULL
    #>  $ ntree          : num 5
    #>  $ mtry           : num 2
    #>  $ forest         :List of 14
    #>   ..$ ndbigtree : int [1:5] 9 17 35 11 19
    #>   ..$ nodestatus: int [1:35, 1:5] 1 1 -1 -1 1 1 -1 -1 -1 0 ...
    #>   ..$ bestvar   : int [1:35, 1:5] 4 4 0 0 2 3 0 0 0 0 ...
    #>   ..$ treemap   : int [1:35, 1:2, 1:5] 2 4 0 0 6 8 0 0 0 0 ...
    #>   ..$ nodepred  : int [1:35, 1:5] 0 0 3 1 0 0 2 2 3 0 ...
    #>   ..$ xbestsplit: num [1:35, 1:5] 1.65 0.8 0 0 2.25 4.75 0 0 0 0 ...
    #>   ..$ pid       : num [1:3] 1 1 1
    #>   ..$ cutoff    : num [1:3] 0.333 0.333 0.333
    #>   ..$ ncat      : Named int [1:4] 1 1 1 1
    #>   .. ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
    #>   ..$ maxcat    : int 1
    #>   ..$ nrnodes   : int 35
    #>   ..$ ntree     : num 5
    #>   ..$ nclass    : int 3
    #>   ..$ xlevels   :List of 4
    #>   .. ..$ Sepal.Length: num 0
    #>   .. ..$ Sepal.Width : num 0
    #>   .. ..$ Petal.Length: num 0
    #>   .. ..$ Petal.Width : num 0
    #>  $ y              : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
    #>   ..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
    #>  $ test           : NULL
    #>  $ inbag          : NULL
    #>  $ terms          :Classes 'terms', 'formula'  language Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width
    #>   .. ..- attr(*, "variables")= language list(Species, Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)
    #>   .. ..- attr(*, "factors")= int [1:5, 1:4] 0 1 0 0 0 0 0 1 0 0 ...
    #>   .. .. ..- attr(*, "dimnames")=List of 2
    #>   .. .. .. ..$ : chr [1:5] "Species" "Sepal.Length" "Sepal.Width" "Petal.Length" ...
    #>   .. .. .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
    #>   .. ..- attr(*, "term.labels")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
    #>   .. ..- attr(*, "order")= int [1:4] 1 1 1 1
    #>   .. ..- attr(*, "intercept")= num 0
    #>   .. ..- attr(*, "response")= int 1
    #>   .. ..- attr(*, ".Environment")=<environment: 0x7f9bed91f8d8> 
    #>   .. ..- attr(*, "predvars")= language list(Species, Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)
    #>   .. ..- attr(*, "dataClasses")= Named chr [1:5] "factor" "numeric" "numeric" "numeric" ...
    #>   .. .. ..- attr(*, "names")= chr [1:5] "Species" "Sepal.Length" "Sepal.Width" "Petal.Length" ...
    #>  - attr(*, "class")= chr [1:2] "randomForest.formula" "randomForest"
    
    # Names of the predictors the model was trained on (the response is not
    # included). `terms` is available here because the model was fit through
    # the formula interface; the same names also appear as the row names of
    # model$importance in the str() output.
    attr(model$terms, "term.labels")
    #> [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"
    
    # Class of each variable in the formula, response included; useful for
    # checking that the columns of the new data have matching types.
    attr(model$terms, "dataClasses")
    #>      Species Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
    #>     "factor"    "numeric"    "numeric"    "numeric"    "numeric"