Search code examples
r, list, r-caret, rpart, confusion-matrix

How to use a function to produce confusion matrices using the Caret package from nested subsets in a master-list


I want to incorporate the function confusionMatrix() from the caret package into the function shuffle100 to produce confusion matrices from subsets (data frames) of a master list produced from classification tree models. My aim is to produce confusion matrix statistics such as classification accuracy, the kappa metric, etc. (desired output below). I am sorry to ask such a simple question, but I cannot figure this out. If anyone can help, then many thanks in advance.

Reproducible dummy data can be found at this address:

Reproducible data

Code to produce a nested list of classification tree model predictions and confusion matrices

        library(caret)
        library(e1071)
        library(rpart)

        set.seed(1235) # seed the RNG so the resampling below is repeatable

       # Run 10 resampling rounds. Each round draws 80 rows from my_data, fits a
       # classification tree on a 70 % partition of them, predicts for all 80 rows
       # and hands the predictions to confusionMatrix(). The argument n is not
       # used in the body; it only drives the 10 repetitions.
       shuffle100 <-lapply(seq(10), function(n){ # one call per round
       # Draw 80 random row indices and keep those rows.
       # NOTE(review): the local name `subset` shadows base::subset inside this function.
       subset <- my_data[sample(nrow(my_data), 80),]
       # Random permutation of the row positions 1..nrow(subset).
       subset_idx <- sample(1:nrow(subset), replace = FALSE)
       # Reorder the drawn rows by that permutation.
       subset <- subset[subset_idx, ] 
       # Pick a 70 % partition (p = 0.7), returned as a matrix of positions (list = FALSE).
       # NOTE(review): the partition is computed from subset_idx, i.e. the permuted
       # row positions, not from the Family column - confirm that is intended.
       subset_resampled_idx <- createDataPartition(subset_idx, times = 1, p = 0.7, list = FALSE)
       subset_resampled <- subset[subset_resampled_idx, ] # the 70 % training rows
       # Fit a classification tree of Family on all other columns (cp = 0.005).
       ct_mod<-rpart(Family~., data=subset_resampled, method="class", control=rpart.control(cp=0.005))
       # Predict for columns 2:13 of ALL rows in subset (training and held-out alike).
       # NOTE(review): no `type` argument is given; for a method = "class" tree this
       # presumably returns per-class probabilities rather than class labels, which
       # would explain the sort.list() error reported below - confirm against the
       # predict.rpart documentation.
       ct_pred<-predict(ct_mod, newdata=subset[,2:13])
       # NOTE(review): `norm` is not created anywhere in this snippet - TODO confirm
       # which data frame is meant; its Family column would have to line up with
       # the 80 predicted rows.
       confusionMatrix(ct_pred, norm$Family) # intended result: one confusion matrix per round
       })

Error messages

        Error in sort.list(y) : 'x' must be atomic for 'sort.list'
        Have you called 'sort' on a list?
        Called from: sort.list(y)

Desired outcome

                    Confusion Matrix and Statistics

                    Reference
         Prediction G8 V4
                 G8 42 12
                 V4  8 18

                Accuracy : 0.75            
                  95% CI : (0.6406, 0.8401)
     No Information Rate : 0.625           
     P-Value [Acc > NIR] : 0.01244         

                   Kappa : 0.4521          
  Mcnemar's Test P-Value : 0.50233         

             Sensitivity : 0.8400          
             Specificity : 0.6000          
          Pos Pred Value : 0.7778          
          Neg Pred Value : 0.6923          
              Prevalence : 0.6250          
          Detection Rate : 0.5250          
    Detection Prevalence : 0.6750          
       Balanced Accuracy : 0.7200          

        'Positive' Class : G8              

Solution

  • Here is a function to produce confusion matrices from sub-lists (data frames) in a master list produced from classification tree models, using the function confusionMatrix in the caret package.

       # Append four placeholder columns to every prediction table in shuffle100:
       #   (1) `Predicted`
       #   (2) `Actual`
       #   (3) `Binary`
       #   (4) `Actual2`
       # The placeholders are blank strings at this point; later steps overwrite
       # them.
       #
       # Input that is not already a data frame (for example a matrix) is
       # converted first so the data-frame cbind method is used. The placeholder
       # vector is sized to the table with rep("", nrow(df)), so a zero-row table
       # gets zero-length columns instead of stopping with
       # "arguments imply differing number of rows".
       my_list <- lapply(shuffle100, function(df) {
         if (!is.data.frame(df)) {
           df <- as.data.frame(df)
         }
         blank <- rep("", nrow(df))
         cbind(
           df,
           Predicted = blank,
           Actual = blank,
           Binary = blank,
           Actual2 = blank
         )
       })
    
      # Reset the four bookkeeping columns of every table to NA so that each one
      # can be filled in by the steps below:
      #   `Predicted` = NA
      #   `Actual`    = NA
      #   `Binary`    = NA
      #   `Actual2`   = NA
      # Base R sub-assignment is used so this step does not depend on dplyr, which
      # none of the visible library() calls attaches. The is.data.frame() check
      # keeps non-data-frame input a hard error rather than silently reshaping it.
      Final_lists <- lapply(my_list, function(x) {
        stopifnot(is.data.frame(x))
        x[c("Predicted", "Actual", "Binary", "Actual2")] <- NA
        x
      })
    
      # FILL THE PREDICTED COLUMN
      #
      # Columns 1 and 2 of each table hold the scores of the two classes, and the
      # column names are used as the class labels (V4 / G8 in the example data).
      # `Predicted` receives the name of column 2 where its score is strictly
      # greater than the score in column 1, and the name of column 1 otherwise
      # (ties therefore go to column 1).
      #
      # The comparison runs on whole columns rather than cell by cell, so a
      # zero-row table stays empty instead of gaining a spurious NA row, and the
      # result is written to `Predicted` by name rather than to position 3.
      Final_lists <- lapply(Final_lists, function(scores) {
        class_names <- names(scores)[1:2]
        second_wins <- scores[[2]] > scores[[1]]
        # FALSE + 1 selects the first class name, TRUE + 1 the second.
        scores$Predicted <- class_names[second_wins + 1]
        scores
      })

      Final_lists
    
     # FILL THE ACTUAL COLUMN
     #
     # Take the observed classes from normalised_scores$Family. This vector is
     # what gets inserted into the `Actual` column of every table in the list.
     # NOTE(review): normalised_scores is not created anywhere in this snippet -
     # confirm it is the data the predictions were made from and that its rows
     # line up with the rows of each prediction table.
     normalised_Actual <- normalised_scores$Family
     Actual <- normalised_Actual
    
      # Copy the observed classes into every prediction table.
      #
      # Each table in Final_lists gets the same `Actual` vector, which is read
      # from the enclosing environment. The tables come back in their original
      # order as an unnamed list.
      Actual_list <- lapply(unname(Final_lists), function(scores) {
        scores$Actual <- Actual
        scores
      })
    
      # FILL THE BINARY COLUMN
      #
      # For every table in the list, `Binary` is 1 where Predicted equals Actual
      # and 0 where it does not.
      #
      # Both columns are compared as character so a character `Predicted` can be
      # matched against a factor `Actual`. The comparison is vectorised over the
      # whole column: zero-row tables stay empty, and a missing value on either
      # side yields NA in `Binary` instead of stopping with
      # "missing value where TRUE/FALSE needed".
      Actual_list <- lapply(Actual_list, function(scores) {
        is_match <- as.character(scores$Predicted) == as.character(scores$Actual)
        scores$Binary <- as.numeric(is_match)
        scores
      })
    
    
     # FILL THE ACTUAL2 COLUMN
     #
     # `Actual2` is a numeric indicator of the observed class: 1 where Actual is
     # "V4" and 0 otherwise. The test is vectorised over the whole column, so
     # zero-row tables stay empty and a missing Actual yields NA instead of
     # stopping with "missing value where TRUE/FALSE needed".
     Actual_list <- lapply(Actual_list, function(scores) {
       scores$Actual2 <- as.numeric(as.character(scores$Actual) == "V4")
       scores
     })

    Actual_list
    
    # Generate confusion matrices
    #
    # Build one caret confusion matrix per table, comparing `Predicted` (data)
    # against `Actual` (reference). confusionMatrix() expects both inputs to be
    # factors over the same levels, while `Predicted` was filled with plain
    # column-name strings, so both vectors are converted to factors here.
    # The level set starts with the levels of `Actual` (preserving their order,
    # hence the choice of 'Positive' class) and is extended by any label that
    # only occurs in `Predicted`.
    confusionMatrices <- lapply(Actual_list, function(scores) {
      actual <- as.factor(scores$Actual)
      predicted_chr <- as.character(scores$Predicted)
      class_levels <- union(
        levels(actual),
        predicted_chr[!is.na(predicted_chr)]
      )
      confusionMatrix(
        factor(predicted_chr, levels = class_levels),
        factor(actual, levels = class_levels)
      )
    })