Search code examples
rmachine-learningr-caretconfusion-matrix

Error in table(data, reference, dnn = dnn, ...) : all arguments must have the same length when running confusionMatrix with caret in R


I have an issue running a confusionMatrix.

here is what I do:

# Fit a random forest for `tested` on all other columns of the training data.
# Resampling comes from `ctrlInside`, models are compared on the "ROC" metric,
# and na.exclude is supplied as the NA handler.
rf <- caret::train(
  tested ~ .,
  data = training_data,
  method = "rf",
  metric = "ROC",
  trControl = ctrlInside,
  na.action = na.exclude
)

# Show the fitted model.
rf

After I get my model this is the next step I take:

# Class probabilities for the test set (type = "prob").
evalResult.rf <- predict(rf, testing_data, type = "prob")
# NOTE: ifelse() is applied to the whole probability object, i.e. to every
# column of it, not to a single class column; values below 0.5 are labelled
# "positive" and all others "negative".
predict_rf <- as.factor(ifelse(evalResult.rf <0.5, "positive", "negative"))

And then I am running my confusion matrix.

# Cross-tabulate the predicted labels against the observed `tested` labels;
# the third positional argument ("positive") names the positive class.
cm_rf_forest <- confusionMatrix(predict_rf, testing_data$tested, "positive") 

And the error comes after I apply the confusionMatrix:

Error in table(data, reference, dnn = dnn, ...) : 
  all arguments must have the same length

Nevertheless, I give you bits of my data.

train data:

structure(list(tested = structure(c(1L, 1L, 1L, 1L, 1L, 
1L), .Label = c("negative", "positive"), class = "factor"), Gender = structure(c(2L, 
2L, 1L, 1L, 2L, 2L), .Label = c("Female", "Male", "Other"), class = "factor"), 
    Age = c(63, 23, 28, 40, 31, 60), number_days_symptoms = c(1, 
    1, 16, 1, 14, 1), care_home_worker = structure(c(1L, 2L, 
    1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    health_care_worker = structure(c(1L, 1L, 1L, 1L, 2L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), how_unwell = c(1, 1, 6, 4, 2, 
    1), self_diagnosis = structure(c(1L, 1L, 2L, 1L, 2L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), chills = structure(c(1L, 1L, 2L, 
    1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    cough = structure(c(1L, 1L, 2L, 2L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), diarrhoea = structure(c(1L, 1L, 
    1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    fatigue = structure(c(1L, 2L, 2L, 2L, 2L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), headache = structure(c(2L, 2L, 
    3L, 2L, 2L, 2L), .Label = c("Headcahe", "No", "Yes"), class = "factor"), 
    loss_smell_taste = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), muscle_ache = structure(c(1L, 
    1L, 2L, 2L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"), 
    nasal_congestion = structure(c(1L, 1L, 1L, 2L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), nausea_vomiting = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    shortness_breath = structure(c(1L, 1L, 1L, 1L, 2L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), sore_throat = structure(c(1L, 
    1L, 1L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    sputum = structure(c(1L, 1L, 2L, 2L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), temperature = structure(c(4L, 
    4L, 4L, 4L, 1L, 4L), .Label = c("37.5-38", "38.1-39", "39.1-41", 
    "No"), class = "factor"), asthma = structure(c(2L, 1L, 1L, 
    1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    diabetes_type_one = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), diabetes_type_two = structure(c(2L, 
    1L, 1L, 1L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"), 
    obesity = structure(c(1L, 2L, 2L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), hypertension = structure(c(1L, 
    1L, 2L, 1L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"), 
    heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 2L), .Label = c("No", 
    "Yes"), class = "factor"), lung_condition = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), kidney_disease = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor")), row.names = c(1L, 
3L, 4L, 5L, 6L, 7L), class = "data.frame")

and here is my test_data:

structure(list(tested = structure(c(1L, 1L, 1L, 1L, 1L, 
1L), .Label = c("negative", "positive"), class = "factor"), Gender = structure(c(1L, 
2L, 1L, 1L, 1L, 2L), .Label = c("Female", "Male", "Other"), class = "factor"), 
    Age = c(19, 26, 30, 45, 40, 43), number_days_symptoms = c(20, 
    1, 1, 20, 14, 1), care_home_worker = structure(c(1L, 1L, 
    1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    health_care_worker = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), how_unwell = c(7, 6, 6, 6, 6, 
    2), self_diagnosis = structure(c(2L, 1L, 1L, 2L, 2L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), chills = structure(c(2L, 1L, 1L, 
    1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    cough = structure(c(2L, 1L, 1L, 2L, 2L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), diarrhoea = structure(c(2L, 1L, 
    1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    fatigue = structure(c(2L, 1L, 1L, 2L, 2L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), headache = structure(c(2L, 2L, 
    2L, 3L, 2L, 3L), .Label = c("Headcahe", "No", "Yes"), class = "factor"), 
    loss_smell_taste = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), muscle_ache = structure(c(2L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    nasal_congestion = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), nausea_vomiting = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    shortness_breath = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), sore_throat = structure(c(1L, 
    1L, 1L, 2L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"), 
    sputum = structure(c(2L, 1L, 1L, 2L, 1L, 2L), .Label = c("No", 
    "Yes"), class = "factor"), temperature = structure(c(4L, 
    4L, 4L, 1L, 1L, 4L), .Label = c("37.5-38", "38.1-39", "39.1-41", 
    "No"), class = "factor"), asthma = structure(c(1L, 1L, 1L, 
    1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    diabetes_type_one = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), diabetes_type_two = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    obesity = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), hypertension = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), lung_condition = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), kidney_disease = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor")), row.names = c(2L, 
8L, 11L, 14L, 20L, 27L), class = "data.frame")

Additionally, I balance the classes with SMOTE on a subsample, configured in ctrlInside.

This is my smote function:

# Custom sampling specification for caret::trainControl(sampling = ...).
# Rebalances the classes with DMwR::SMOTE using k = 3 nearest neighbours.
#
# Elements:
#   name  - label for this sampling method.
#   func  - function(x, y): x holds the predictors (a data frame, or anything
#           as.data.frame() can coerce), y the class labels. Returns a list
#           with the resampled predictors (`x`) and class labels (`y`).
#   first - TRUE, i.e. caret applies the sampling before any pre-processing.
smotest <- list(
  name = "SMOTE with more neighbors!",
  func = function(x, y) {
    # Loaded inside the function so the package is attached wherever caret
    # evaluates the sampler.
    library(DMwR)
    dat <- if (is.data.frame(x)) x else as.data.frame(x)
    # Temporarily attach the outcome so SMOTE can use the formula interface.
    dat$.y <- y
    dat <- SMOTE(.y ~ ., data = dat, k = 3, perc.over = 100, perc.under = 200)
    # Remove only the helper outcome column. An exact name comparison avoids
    # dropping predictors whose names merely contain ".y", and drop = FALSE
    # keeps a data frame even when a single predictor remains.
    list(
      x = dat[, colnames(dat) != ".y", drop = FALSE],
      y = dat$.y
    )
  },
  first = TRUE
)

And ctrlInside is this:

# Resampling configuration used when training: 10-fold cross-validation
# repeated 5 times with a grid search, scored by twoClassSummary on class
# probabilities, keeping the hold-out predictions and using the custom
# `smotest` sampler.
ctrlInside <- trainControl(
  # Resampling scheme
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  search = "grid",
  # Performance measurement
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  savePredictions = TRUE,
  # Class balancing
  sampling = smotest
)

These functions are given just so that you have an idea of what I am doing as a whole. Is there a reason why this is happening?


Solution

  • You can use complete.cases to predict only on the rows that have no NAs; you must also operate on the whole matrix of probabilities, as I show below. Using an example dataset, I set 10 values of one column to NA, and train:

    # Toy reproduction on iris: 100 randomly chosen rows form the training set.
    idx <- sample(nrow(iris), 100)
    data <- iris
    # Blank out Petal.Length in 10 random rows to introduce missing values.
    data$Petal.Length[sample(nrow(data), 10)] <- NA
    # Two-class outcome: versicolor is "positive", every other species "negative".
    data$tested <- factor(
      ifelse(data$Species == "versicolor", "positive", "negative")
    )
    # Drop the original Species column (column 5); `tested` replaces it.
    data <- data[, -5]
    # Split into training rows (idx) and the remaining test rows.
    training_data <- data[idx, ]
    testing_data <- data[-idx, ]
    
    # Same model as in the question: random forest on all predictors, resampled
    # via `ctrlInside`, compared on the "ROC" metric, with na.exclude supplied
    # as the NA handler.
    rf <- caret::train(
      tested ~ .,
      data = training_data,
      method = "rf",
      metric = "ROC",
      trControl = ctrlInside,
      na.action = na.exclude
    )
    

    Compute the evaluation result and you can see that I get the same error:

    # Reproduces the question's steps unchanged so that the same error appears.
    # Class probabilities for the test set (type = "prob").
    evalResult.rf <- predict(rf, testing_data, type = "prob")
    # ifelse() runs over the whole probability object (every column), not over
    # a single class column.
    predict_rf <- as.factor(ifelse(evalResult.rf <0.5, "positive", "negative"))
    # Fails: the predicted labels and the reference labels differ in length.
    cm_rf_forest <- confusionMatrix(predict_rf, testing_data$tested, "positive") 
    
    Error in table(data, reference, dnn = dnn, ...) : 
      all arguments must have the same length
    

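    So there are two sources of error. First, you have NAs, and rows containing NAs cannot be predicted, so they are left out of the prediction. Second, evalResult.rf is a table of probabilities with one column per class (the first column is the probability of the negative class, the second that of the positive class), so applying ifelse() to all of it yields two values per row instead of one: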

    head(evalResult.rf)
       negative positive
    3     1.000    0.000
    6     1.000    0.000
    9     0.948    0.052
    12    1.000    0.000
    13    0.976    0.024
    19    0.998    0.002
    

    To get the classes, find the column with the maximum value in each row and return the corresponding column name, which is the class:

    # max.col() gives, for each row, the index of the column holding the largest
    # value; indexing the column names with it yields one class label per row.
    # Note: max.col() breaks ties at random by default.
    colnames(evalResult.rf)[max.col(evalResult.rf)]
    

    We do now:

    # Keep only test rows without missing values, so every remaining row gets a
    # prediction and the predictions line up with the reference labels.
    testing_data <- testing_data[complete.cases(testing_data), ]
    evalResult.rf <- predict(rf, testing_data, type = "prob")
    # One label per row: the class whose probability column is the largest.
    # ties.method = "first" makes ties deterministic (the default breaks them at
    # random). Taking the levels from the reference keeps both factors on
    # identical levels even if one class is never predicted.
    predict_rf <- factor(
      colnames(evalResult.rf)[max.col(evalResult.rf, ties.method = "first")],
      levels = levels(testing_data$tested)
    )
    # Name the `positive` argument explicitly rather than relying on position.
    cm_rf_forest <- confusionMatrix(predict_rf, testing_data$tested,
                                    positive = "positive")
    
    Confusion Matrix and Statistics
    
              Reference
    Prediction negative positive
      negative       33        1
      positive        0       11
    
                   Accuracy : 0.9778          
                     95% CI : (0.8823, 0.9994)
        No Information Rate : 0.7333          
        P-Value [Acc > NIR] : 1.507e-05       
    
                      Kappa : 0.9416