Search code examples
rmachine-learningclassificationrocproc-r-package

How to Create Multiple ROC Curves on the Same Plot for Multi.roc() objects (3 classes) in the pROC Package in R


Issue:

I want to plot ROC curves from multi.roc() objects for 12 models (see below) that I have produced onto the same plot to compare them. All my models contain 3 classes, which makes this conundrum a bit more complicated. I tried to follow the code from a question that I previously asked here; however, I kept on getting error messages since that question is a binary problem and my data has three classes for the dependent variable 'Country' (3 categorical levels - France, Holland, and Spain)

I have tried using three different methods: see Attempt 1, Attempt 2, and Attempt 3, and digrams 1 and 2

With all attempts, I am not sure how to incorporate 12 models onto the same plot. Diagram 1 is obviously wrong as the AUC = 0.90

I've searched online through other StackOverflow questions and tutorials, but nothing entirely solves the whole problem. Any solutions that I have found so far, I cannot get their code to work properly. Any help is highly appreciated.

R-CODE: Train the models

#Open libraries
library(MASS)
library(caret)
library(e1071)
library(klaR)
library(gbm)
library(earth)
library(kernlab)
library(rpart)
library(randomForest)
library(mlbench)
library(adabag)
library(ada)
library(fastAdaboost)
library(xgboost)
library(C50)

##Produce a new version of the dataframe 'Clusters_Dummy' with the rows shuffled
NewClusters=Cluster_Dummy_2[sample(1:nrow(Cluster_Dummy_2)),]

#Produce a dataframe
NewCluster<-as.data.frame(NewClusters)

#display
print(NewCluster)

#Check the structure of the data
str(NewCluster)

#Number of rows
nrow(NewCluster)

#Split the data frame into 70% to 30% train and test data
training.parameters <- Cluster_Dummy_2$Country %>% 
createDataPartition(p = 0.7, list = FALSE)
train.data <- NewClusters[training.parameters, ]
test.data <- NewClusters[-training.parameters, ]
sapply(train.data, summary) # Look at a summary of the training data

####################################################
#FIT MODELS: Auxiliary function to train the models
###################################################

fitControl <- trainControl(## 10-fold CV
                          method = "repeatedcv",
                          number = 10,
                          ## repeated ten times
                          repeats = 10,
                          classProbs = TRUE,
                          verbose = TRUE)


tuneLength <- 10
metric <- "Accuracy"

#####
#LDA#
#####
#Train the model
lda.fit.CV = train(Country ~ ., data=train.data, method="lda",
                   trControl = fitControl, metric=metric, tuneLength = tuneLength)

lda.fit.CV

#####################################
# Stochastic Boosted Gradient Trees #
#####################################

gbmGrid <-  expand.grid(interaction.depth = c(1, 5, 9), 
                        n.trees = (1:30)*50, 
                        shrinkage = 0.1,
                        n.minobsinnode = 20)

#Stochastic Boosted Gradient Tree: model 1
gbmFit1 <- train(Country ~ ., data=train.data,   method = "gbm",  metric=metric, 
                 trControl = fitControl, tuneLength = tuneLength, verbose=FALSE)

gbmFit1

#Stochastic Boosted Gradient Tree: model 2
gbmFit2 <- train(Country ~ ., data=train.data, method = "gbm",  trControl = fitControl, 
                 metric=metric, tuneLength = tuneLength,  tuneGrid = gbmGrid, verbose=FALSE)

gbmFit2

#########################################################
# Multivariate Adaptive Regression Splines (MARS) model  
#########################################################

# Step 1: Define the tuneGrid
marsGrid <-  expand.grid(nprune = c(2, 4, 6, 8, 10), 
                         degree = c(1, 2, 3))

# Train the model using randomForest and predict on the training data itself.
model_mars = train(Country ~ .,  data=train.data,  method='earth',  metric='ROC', tuneGrid = marsGrid, trControl = fitControl, tuneLength = tuneLength)

model_mars

###############################
# Single Vector Machine (SVM) #
###############################

model_svmRadial = train(Country ~ ., data=train.data, method='svmRadial',, metric=metric, trControl = fitControl, tuneLength= tuneLength)

model_svmRadial

###############################################
# Recursive Partitioning Classification Trees #
###############################################

f <- as.formula(paste0("Country ~ ", paste0(names(train.data)[2:10], collapse = "+")))

rpart.ctrl <- rpart.control(minsplit = 5, minbucket = 5, cp = seq(0, 0.02, 0.0001))

dt.rpart <- train(form = f, data = train.data, method = "rpart", metric = metric, trControl = fitControl, tuneGrid = rpart.ctrl, tuneLength= tuneLength)
dt.rpart

##############
Naive Bayes
############

#Tune the model
nb_tune <- data.frame(usekernel = TRUE, fL = 0, adjust=seq(0, 5, by = 1))

#Train the model
model.nb = train(Country ~., data=train.data,'nb', trControl=fitControl, metric=metric, tuneLength=tuneLength, tuneGrid = nb_tune, laplace = 0:3)
model.nb

#################
# Random Forest #
#################

model_rf = train(Country ~., data=train.data, method='rf', metric=metric, tuneLength= tuneLength, trControl = fitControl)

model_rf

#####################################
# Regularized Discriminant Analysis #
#####################################

rdaGrid=data.frame(gamma = (0.4)/4, lambda = 3/4)

rdaFit <- train(Country ~ ., data =train.data, method = "rda", trControl = fitControl, tuneLength = tuneLength, metric = "ROC")
rdaFit    

#####################################
# Classification with Decision Tree #
#####################################

#Train the model
Decision_Fit <- train(Country ~ ., data =train.data, method = "C5.0", trControl = fitControl, tuneLength = tuneLength, metric = "ROC")
Decision_Fit 

##################################
# K-nearest neighbour classifier #
##################################

#Train the model
model_knn = caret::train(Country ~ ., data=train.data, method='knn', 
                         tuneLength = tuneLength, metric=metric, 
                         trControl = fitControl, 
                         tuneGrid = expand.grid(k = seq(1, 101, by = 2)))
model_knn

#############################
# Neural Network Classifier #
#############################

Neural_Fit <- train(Country ~ ., data =train.data, method = "nnet", trControl = fitControl, tuneLength = tuneLength, metric = "ROC")
Neural_Fit

Predict the models on the test data

#####
#LDA#
#####

pred_LDA = predict(lda.fit.CV, test.data, type="prob")
pred_LDA

######################################
# Stochastic Boosted Gradient Trees #
#####################################

## Stochastic Boosted Gradient Trees: model 1
#Predict the model with the test data
pred_model_Tree1 = predict(gbmFit1, newdata = test.data, type = "prob")
pred_model_Tree1

print(pred_model_Tree1)

## Stochastic Boosted Gradient Trees: model 1
pred_model_Tree1$evaluation <- names(pred_model_Tree1)[apply(pred_model_Tree1, 1, which.max)]
pred_model_Tree1$evaluation

table(pred_model_Tree1$evaluation)

#Now you can print the confusionMatrix (make sure each factor has the same levels)
confusionMatrix(factor(pred_model_Tree1$evaluation, levels = unique(test.data$Country)),
                factor(test.data$Country, levels = unique(test.data$Country)))

#Predict the model with the test data
pred_model_Tree2 = predict(gbmFit2, newdata = test.data, type = "prob")
pred_model_Tree2

print(pred_model_Tree2)

## Stochastic Boosted Gradient Trees: model 2
pred_model_Tree2$evaluation <- names(pred_model_Tree2)[apply(pred_model_Tree2, 1, which.max)]
pred_model_Tree2$evaluation

#########################################
# Bagged Flexible Discriminant Analysis #
########################################

#Predict the bagged flexible discriminate model with the test data
Earth_fitted <- predict(model_mars, newdata = test.data, type = "prob")
Earth_fitted

Earth_fitted$evaluation <- names(Earth_fitted)[apply(Earth_fitted, 1, which.max)]
Earth_fitted$evaluation

###############################
# Single Vector Machine (SVM) #
###############################

#Predict the random forest model with the test data
SVM_fitted <- predict(model_svmRadial, newdata = test.data, type = "prob")
SVM_fitted

#Evaluate the predictions
SVM_fitted$evaluation <- names(SVM_fitted)[apply(SVM_fitted, 1, which.max)]
SVM_fitted$evaluation

###############################################
# Recursive Partitioning Classification Trees #
##############################################

#Predict the random forest model with the test data
rpart_fitted <- predict(dt.rpart, newdata = test.data, type = "prob")
rpart_fitted

#produce a dataframe
rpart_fit<-as.data.frame(rpart_fitted)
rpart_fit

#Evaluate the predictions
rpart_fit$evaluation <- factor(max.col(rpart_fit[,1:3]), levels=1:3, labels = c("France", "Spain", "Holland"))
rpart_fit$evaluation 

###############
# Naïve Bayes #
#############

#Predict the model with probabilities
predict.nb<-predict(model.nb$finalModel, newdata = test.data, type = "prob")
predict.nb

#Predict the model with the classes
pedict.class.nb<-predict(model.nb$finalModel, newdata = test.data, type = "prob")$class
pedict.class.nb

#Unlist the results as the function table() and confusionMatrix do not recognize lists
unlist.predicted.nb.Country <-unlist(predict.nb$class)
unlist.predicted.nb.Country

unlist.predicted.nb.posterior <-unlist(predict.nb$posterior)
unlist.predicted.nb.posterior

#produce a dataframe
nb_fit<-as.data.frame(unlist.predicted.nb.posterior)
nb_fit

#Evaluate the predictions
nb_fit$evaluation <- factor(max.col(nb_fit[,1:3]), levels=1:3, labels=c(""France", "Spain", "Holland""))
nb_fit$evaluation 

#################
# Random Forest #
################

#Now that we have generated a classification model
#Model Evaluation
pred_rf<-predict(model_rf, newdata = test.data, type = "prob")
pred_rf

#Evaluate the predictions
pred_rf$evaluation <- factor(max.col(pred_rf[,1:3]), levels=1:3, labels = c("France", "Spain", "Holland""))
pred_rf$evaluation

#number of rows
nrow(pred_rf)

#####################################
# Regularized Discriminant Analysis #
#####################################

#Now that we have generated a classification model
#Model Evaluation
pred_rda<-predict(rdaFit, newdata = test.data, type = "prob")
pred_rda

#Evaluate the predictions
rdaFit $evaluation <- factor(max.col(pred_rda[,1:3]), levels=1:3, labels = c(""France", "Spain", "Holland""))
rdaFit $evaluation

#number of rows
nrow(pred_rf)

##################
# Decision Trees #
##################

#Now that we have generated a classification model
#Model Evaluation
pred_decision<-predict(Decision_Fit, newdata = test.data, type = "prob")
pred_decision

#Evaluate the predictions
pred_decision$evaluation <- factor(max.col(pred_decision[,1:3]), levels=1:3, labels = c(""France", "Spain", "Holland""))
pred_decision$evaluation

#########################
# knn nearest neighbor #
########################

#Now that we have generated a classification model
#Model Evaluation
pred_knn<-predict(model_knn, newdata = test.data, type = "prob")
pred_knn

#Evaluate the predictions
pred_knn$evaluation <- factor(max.col(pred_knn[,1:3]), levels=1:3, labels = c("France", "Spain", "Holland"))
pred_knn$evaluation

########################
# knn nearest neighbor #
########################

#Now that we have generated a classification model
#Model Evaluation
pred_net<-predict(Neural_Fit, newdata = test.data, type = "prob")
pred_net

#Evaluate the predictions
pred_net$evaluation <- factor(max.col(pred_net[,1:3]), levels=1:3, labels = c("France", "Spain", "Holland"))
pred_net$evaluation

#Contingency table of predictions
table(pred_net$evaluation)

#Now you can print the confusionMatrix (make sure each factor has the same levels)
confusionMatrix(factor(pred_net$evaluation, levels = unique(test.data$Country)),
                factor(test.data$Country, levels = unique(test.data$Country)))

Produce the mult.roc() ojbects

#LDA
roc_LDA <- multiclass.roc(test.data$Country, pred_LDA, levels=c("France", "Spain", "Holland"), auc=TRUE)
roc_LDA 

#Boosted Tree model 1
roc_Tree1 <- multiclass.roc(test.data$Country, pred_model_Tree1, levels=c("France", "Spain", "Holland"), auc=TRUE)
roc_Tree1 

#Boosted Tree model 2
roc_Tree2 <- multiclass.roc(test.data$Country, pred_model_Tree2, levels=c("France", "Spain", "Holland"), auc=TRUE)
roc_Tree2 

#Mars model
roc_Mars <- multiclass.roc(test.data$Country, Earth_fitted, levels=c("France", "Spain", "Holland"), auc=TRUE)
roc_Mars

#Single Vector Machine
roc_SVM <- multiclass.roc(test.data$Country, SVM_fitted, levels=c("France", "Spain", "Holland"), auc=TRUE)
roc_SVM

#Single Vector Machine
roc_RART <- multiclass.roc(test.data$Country, rpart_fitted, levels=c("France", "Spain", "Holland"), auc=TRUE)
roc_RART

#Naive Bayes
roc_nb <- multiclass.roc(test.data$Country, predict.nb$posterior, levels=c("France", "Spain", "Holland"), auc=TRUE)
roc_nb 

#Single Vector Machine
roc_RART <- multiclass.roc(test.data$Country, rpart_fitted, levels=c("France", "Spain", "Holland"))
roc_RART

#Random Forest
roc_RF <- multiclass.roc(test.data$Country, pred_rf, levels=c("France", "Spain", "Holland"), auc=TRUE)
roc_RF

#Regularized Discriminant Analysis
roc_RDA <- multiclass.roc(test.data$Country, pred_rda, levels=c("France", "Spain", "Holland"), auc=TRUE)
roc_RDA

#Decision Trees
roc_Decision <- multiclass.roc(test.data$Country, pred_decision, levels=c("France", "Spain", "Holland"), auc=TRUE)
roc_Decision

#K nearest neighbour
roc_knn <- multiclass.roc(test.data$Country, pred_knn, levels=c("France", "Spain", "Holland"), auc=TRUE)
roc_knn

#Neural network
roc_net <- multiclass.roc(test.data$Country, pred_net, levels=c("France", "Spain", "Holland"), auc = TRUE)
roc_net

This is my desired ROC plot

enter image description here

Attempts to solve the problem

#Attempt 1

dev.new()

lvls = levels(Cluster_Dummy_2$Country)

aucs = c()
plot(x=NA, y=NA, xlim=c(0,1), ylim=c(0,1),
     ylab='True Positive Rate',
     xlab='False Positive Rate',
     bty='n')

for (type.id in 1:3) {
  type = as.factor(test.data$Country == lvls[type.id])
  score = pred_LDA([, 'TRUE']], levels[,1:3])
  actual.class = test.data$Country == lvls[type.id]
  
  pred = prediction(score, actual.class)
  nbperf = performance(pred, "tpr", "fpr")
  
  roc.x = unlist([email protected])
  roc.y = unlist([email protected])
  lines(roc.y ~ roc.x, col=type.id+1, lwd=2)
  
  nbauc = performance(pred, "auc")
  nbauc = unlist(slot(nbauc, "y.values"))
  aucs[type.id] = nbauc
}

#Error 

Error: unexpected '}' in "}"

#However, this version did produce a roc curve for the LDA, although, I think it's obviusly wrong as the auc = 90 %

#Attempt 2

score_data <- data.frame(LDA=pred_LDA,
                         Country=test.data$Country,
                         stringsAsFactors=FALSE)

plot(roc(test.data$Country, score_data[,1:3] , direction="<"),
     col="red", lwd=3, main="The turtle finds its way")

#Setting levels: control = Italy, case = Turkey
Error in h(simpleError(msg, call)) : 
  error in evaluating the argument 'x' in selecting a method for function 'plot': Predictor must be numeric or ordered.
In addition: Warning message:
In roc.default(test.data$Country, score_data[, 1:3], direction = "<") :
  'response' has more than two levels. Consider setting 'levels' explicitly or using 'multiclass.roc' instead

Attempt 3:

rs <- roc_net[['rocs']]
print(rs)
plot.roc(rs[[1:2]])
sapply(2:length(rs),function(i) lines.roc(rs[[i]],col=i))

#Error
Error in roc.default(x, predictor, ...) : No valid data provided.
Called from: roc.default(x, predictor, ...)

Diagram_1

enter image description here

Diagram 2

enter image description here

Data

structure(list(Low.Freq = c(435L, 94103292L, 1L, 2688L, 8471L, 
    28818L, 654755585L, 468628164L, 342491L, 2288474L, 3915L, 411L, 
    267864894L, 3312618L, 5383L, 8989443L, 1894L, 534981L, 9544861L, 
    3437614L, 475386L, 7550764L, 48744L, 2317845L, 5126197L, 2445L, 
    8L, 557450L, 450259742L, 21006647L, 9L, 7234027L, 59L, 9L, 605L, 
    9199L, 3022L, 30218156L, 46423L, 38L, 88L, 396396244L, 28934316L, 
    7723L, 95688045L, 679354L, 716352L, 76289L, 332826763L, 6L, 90975L, 
    83103577L, 9529L, 229093L, 42810L, 5L, 18175302L, 1443751L, 5831L, 
    8303661L, 86L, 778L, 23947L, 8L, 9829740L, 2075838L, 7434328L, 
    82174987L, 2L, 94037071L, 9638653L, 5L, 3L, 65972L, 0L, 936779338L, 
    4885076L, 745L, 8L, 56456L, 125140L, 73043989L, 516476L, 7L, 
    4440739L, 612L, 3966L, 8L, 9255L, 84127L, 96218L, 5690L, 56L, 
    3561L, 78738L, 1803363L, 809369L, 7131L, 0L), High.Freq = c(6071L, 
    3210L, 6L, 7306092L, 6919054L, 666399L, 78L, 523880161L, 4700783L, 
    4173830L, 30L, 811L, 341014L, 780L, 44749L, 91L, 201620707L, 
    74L, 1L, 65422L, 595L, 89093186L, 946520L, 6940919L, 655350L, 
    4L, 6L, 618L, 2006697L, 889L, 1398L, 28769L, 90519642L, 984L, 
    0L, 296209525L, 487088392L, 5L, 894L, 529L, 5L, 99106L, 2L, 926017L, 
    9078L, 1L, 21L, 88601017L, 575770L, 48L, 8431L, 194L, 62324996L, 
    5L, 81L, 40634727L, 806901520L, 6818173L, 3501L, 91780L, 36106039L, 
    5834347L, 58388837L, 34L, 3280L, 6507606L, 19L, 402L, 584L, 76L, 
    4078684L, 199L, 6881L, 92251L, 81715L, 40L, 327L, 57764L, 97668898L, 
    2676483L, 76L, 4694L, 817120L, 51L, 116712L, 666L, 3L, 42841L, 
    9724L, 21L, 4L, 359L, 2604L, 22L, 30490L, 5640L, 34L, 51923625L, 
    35544L), Peak.Freq = c(87005561L, 9102L, 994839015L, 42745869L, 
    32840L, 62737133L, 2722L, 24L, 67404881L, 999242982L, 3048L, 
    85315406L, 703037627L, 331264L, 8403609L, 3934064L, 50578953L, 
    370110665L, 3414L, 12657L, 40L, 432L, 7707L, 214L, 68588962L, 
    69467L, 75L, 500297L, 704L, 1L, 102659072L, 60896923L, 4481230L, 
    94124925L, 60164619L, 447L, 580L, 8L, 172L, 9478521L, 20L, 53L, 
    3072127L, 2160L, 27301893L, 8L, 4263L, 508L, 712409L, 50677L, 
    522433683L, 112844L, 193385L, 458269L, 93578705L, 22093131L, 
    6L, 9L, 1690461L, 0L, 4L, 652847L, 44767L, 21408L, 5384L, 304L, 
    721L, 651147L, 2426L, 586L, 498289375L, 945L, 6L, 816L, 46207L, 
    39135L, 6621028L, 66905L, 26905085L, 4098L, 0L, 14L, 88L, 530L, 
    97809006L, 90L, 6L, 260792844L, 9L, 833205723L, 99467321L, 5L, 
    8455640L, 54090L, 2L, 309L, 299161148L, 4952L, 454824L), Delta.Freq = c(5L, 
    78L, 88553L, 794L, 5L, 3859122L, 782L, 36L, 8756801L, 243169338L, 
    817789L, 8792384L, 7431L, 626921743L, 9206L, 95789L, 7916L, 8143453L, 
    6L, 4L, 6363L, 181125L, 259618L, 6751L, 33L, 37960L, 0L, 2L, 
    599582228L, 565585L, 19L, 48L, 269450424L, 70676581L, 7830566L, 
    4L, 86484313L, 21L, 90899794L, 2L, 72356L, 574280L, 869544L, 
    73418L, 6468164L, 2259L, 5938505L, 31329L, 1249L, 354L, 8817L, 
    3L, 2568L, 82809L, 29836269L, 5230L, 37L, 33752014L, 79307L, 
    1736L, 8522076L, 40L, 2289135L, 862L, 801448L, 8026L, 5L, 15L, 
    4393771L, 405914L, 71098L, 950288L, 8319L, 1396973L, 832L, 70L, 
    1746L, 61907L, 8709547L, 300750537L, 45862L, 91417085L, 79892L, 
    47765L, 5477L, 18L, 4186L, 2860L, 754038591L, 375L, 53809223L, 
    72L, 136L, 509L, 232325L, 13128104L, 1692L, 8581L, 23L), Delta.Time = c(1361082L, 
    7926L, 499L, 5004L, 3494530L, 213L, 64551179L, 70L, 797L, 5L, 
    72588L, 86976L, 5163L, 635080L, 3L, 91L, 919806257L, 81443L, 
    3135427L, 4410972L, 5810L, 8L, 46603718L, 422L, 1083626L, 48L, 
    15699890L, 7L, 90167635L, 446459879L, 2332071L, 761660L, 49218442L, 
    381L, 46L, 493197L, 46L, 798597155L, 45342274L, 6265842L, 6L, 
    3445819L, 351L, 1761227L, 214L, 959L, 908996387L, 6L, 3855L, 
    9096604L, 152664L, 7970052L, 32366926L, 31L, 5201618L, 114L, 
    7806411L, 70L, 239L, 5065L, 2L, 1L, 14472831L, 122042249L, 8L, 
    495604L, 29L, 8965478L, 2875L, 959L, 39L, 9L, 690L, 933626665L, 
    85294L, 580093L, 95934L, 982058L, 65244056L, 137508L, 29L, 7621L, 
    7527L, 72L, 2L, 315L, 6L, 2413L, 8625150L, 51298109L, 851L, 890460L, 
    160736L, 6L, 850842734L, 2L, 7L, 76969113L, 190536L), Peak.Time = c(1465265L, 
    452894L, 545076172L, 8226275L, 5040875L, 700530L, 1L, 3639L, 
    20141L, 71712131L, 686L, 923L, 770569738L, 69961L, 737458636L, 
    122403L, 199502046L, 6108L, 907L, 108078263L, 7817L, 4L, 6L, 
    69L, 721L, 786353L, 87486L, 1563L, 876L, 47599535L, 79295722L, 
    53L, 7378L, 591L, 6607935L, 954L, 6295L, 75514344L, 5742050L, 
    25647276L, 449L, 328566184L, 4L, 2L, 2703L, 21367543L, 63429043L, 
    708L, 782L, 909820L, 478L, 50L, 922L, 579882L, 7850L, 534L, 2157492L, 
    96L, 6L, 716L, 5L, 653290336L, 447854237L, 2L, 31972263L, 645L, 
    7L, 609909L, 4054695L, 455631L, 4919894L, 9L, 72713L, 9997L, 
    84090765L, 89742L, 5L, 5028L, 4126L, 23091L, 81L, 239635020L, 
    3576L, 898597785L, 6822L, 3798L, 201999L, 19624L, 20432923L, 
    18944093L, 930720236L, 1492302L, 300122L, 143633L, 5152743L, 
    417344L, 813L, 55792L, 78L), Center_Freq = c(61907L, 8709547L, 
    300750537L, 45862L, 91417085L, 79892L, 47765L, 5477L, 18L, 4186L, 
    2860L, 754038591L, 375L, 53809223L, 72L, 136L, 4700783L, 4173830L, 
    30L, 811L, 341014L, 780L, 44749L, 91L, 201620707L, 74L, 1L, 65422L, 
    595L, 89093186L, 946520L, 6940919L, 48744L, 2317845L, 5126197L, 
    2445L, 8L, 557450L, 450259742L, 21006647L, 9L, 7234027L, 59L, 
    9L, 651547554L, 45554L, 38493L, 91055218L, 38L, 1116474L, 2295482L, 
    3001L, 9L, 3270L, 141L, 53644L, 667983L, 565598L, 84L, 971L, 
    555498297L, 60431L, 6597L, 856943893L, 607815536L, 4406L, 79L, 
    4885076L, 745L, 8L, 56456L, 125140L, 73043989L, 516476L, 7L, 
    4440739L, 754038591L, 375L, 53809223L, 72L, 136L, 509L, 232325L, 
    13128104L, 1692L, 8581L, 23L, 5874213L, 4550L, 644668065L, 3712371L, 
    5928L, 8833L, 7L, 2186023L, 61627221L, 37297L, 716427989L, 21387L
    ), Start.Freq = c(426355L, 22073538L, 680374L, 41771L, 54L, 6762844L, 
    599171L, 108L, 257451851L, 438814L, 343045L, 4702L, 967787L, 
    1937L, 18L, 89301735L, 366L, 90L, 954L, 7337732L, 70891703L, 
    4139L, 10397931L, 940000382L, 7L, 38376L, 878528819L, 6287L, 
    738366L, 31L, 47L, 5L, 6L, 77848L, 2366508L, 45L, 3665842L, 7252260L, 
    6L, 61L, 3247L, 448348L, 1L, 705132L, 144L, 7423637L, 2L, 497L, 
    844927639L, 78978L, 914L, 131L, 7089563L, 927L, 9595581L, 2774463L, 
    1651L, 73509280L, 7L, 35L, 18L, 96L, 1L, 92545512L, 27354947L, 
    7556L, 65019L, 7480L, 71835L, 8249L, 64792L, 71537L, 349389666L, 
    280244484L, 82L, 6L, 40L, 353872L, 0L, 103L, 1255L, 4752L, 29L, 
    76L, 81185L, 14L, 9L, 470775630L, 818361265L, 57947209L, 44L, 
    24L, 41295L, 4L, 261449L, 9931404L, 773556640L, 930717L, 65007421L
    ), End.Freq = c(71000996L, 11613579L, 71377155L, 1942738L, 8760748L, 
    79L, 455L, 374L, 8L, 5L, 2266932L, 597833L, 155488L, 3020L, 4L, 
    554L, 4L, 16472L, 1945649L, 668181101L, 649780L, 22394365L, 93060602L, 
    172146L, 20472L, 23558847L, 190513L, 22759044L, 44L, 78450L, 
    205621181L, 218L, 69916344L, 23884L, 66L, 312148L, 7710564L, 
    4L, 422L, 744572L, 651547554L, 45554L, 38493L, 91055218L, 38L, 
    1116474L, 2295482L, 3001L, 9L, 3270L, 141L, 55595L, 38451L, 8660867L, 
    14L, 96L, 345L, 6L, 44L, 8235824L, 910517L, 1424326L, 87102566L, 
    53644L, 667983L, 565598L, 84L, 971L, 555498297L, 60431L, 6597L, 
    856943893L, 607815536L, 4406L, 79L, 7L, 28978746L, 7537295L, 
    6L, 633L, 345860066L, 802L, 1035131L, 602L, 2740L, 8065L, 61370968L, 
    429953765L, 981507L, 8105L, 343787257L, 44782L, 64184L, 12981359L, 
    123367978L, 818775L, 123745614L, 25345654L, 3L), Country = c("Holland", 
    "Holland", "Holland", "Holland", "Holland", "Holland", "Spain", 
    "Spain", "Spain", "Spain", "Spain", "Spain", "Spain", "Spain", 
    "Spain", "Spain", "Spain", "Spain", "Holland", "Holland", "Holland", 
    "Holland", "Holland", "Holland", "France", "France", "France", 
    "France", "France", "France", "France", "France", "France", "France", 
    "France", "Spain", "Spain", "Spain", "Spain", "Spain", "Spain", 
    "Spain", "Spain", "France", "France", "France", "France", "Holland", 
    "Holland", "Holland", "Holland", "Holland", "Holland", "Holland", 
    "Holland", "Holland", "Holland", "Holland", "Holland", "Holland", 
    "Spain", "Spain", "Spain", "Spain", "Spain", "Spain", "Spain", 
    "Holland", "Holland", "Holland", "Holland", "France", "France", 
    "France", "France", "France", "France", "France", "Spain", "Spain", 
    "Spain", "Spain", "Spain", "Spain", "Spain", "Spain", "Spain", 
    "Spain", "Spain", "Spain", "Spain", "Spain", "Spain", "Spain", 
    "Spain", "Spain", "France", "France", "France")), row.names = c(NA, 
    99L), class = "data.frame")

Solution

  • Train the models:

    #Open libraries
    library(MASS)
    library(caret)
    library(e1071)
    library(klaR)
    library(gbm)
    library(earth)
    library(kernlab)
    library(rpart)
    library(randomForest)
    library(mlbench)
    library(adabag)
    library(ada)
    library(fastAdaboost)
    library(xgboost)
    library(C50)
    
    ##Produce a new version of the dataframe 'Clusters_Dummy' with the rows shuffled
    NewClusters=Cluster_Dummy_2[sample(1:nrow(Cluster_Dummy_2)),]
    
    #Produce a dataframe
    NewCluster<-as.data.frame(NewClusters)
    
    #display
    print(NewCluster)
    
    #Split the data frame into 70% to 30% train and test data
    training.parameters <- Cluster_Dummy_2$Country %>% 
    createDataPartition(p = 0.7, list = FALSE)
    train.data <- NewClusters[training.parameters, ]
    test.data <- NewClusters[-training.parameters, ]
    sapply(train.data, summary) # Look at a summary of the training data
    
    ####################################################
    #FIT MODELS: Auxiliary function to train the models
    ###################################################
    
    fitControl <- trainControl(## 10-fold CV
                              method = "repeatedcv",
                              number = 10,
                              ## repeated ten times
                              repeats = 10,
                              classProbs = TRUE,
                              verbose = TRUE)
    
    
    tuneLength <- 10
    metric <- "Accuracy"
    
    #####
    #LDA#
    #####
    #Train the model
    lda.fit.CV = train(Country ~ ., data=train.data, method="lda",
                       trControl = fitControl, metric=metric, tuneLength = tuneLength)
    
    lda.fit.CV
    
    #####################################
    # Stochastic Boosted Gradient Trees #
    #####################################
    
    gbmGrid <-  expand.grid(interaction.depth = c(1, 5, 9), 
                            n.trees = (1:30)*50, 
                            shrinkage = 0.1,
                            n.minobsinnode = 20)
    
    #Stochastic Boosted Gradient Tree: model 1
    gbmFit1 <- train(Country ~ ., data=train.data,   method = "gbm",  metric=metric, 
                     trControl = fitControl, tuneLength = tuneLength, verbose=FALSE)
    
    gbmFit1
    
    #Stochastic Boosted Gradient Tree: model 2
    gbmFit2 <- train(Country ~ ., data=train.data, method = "gbm",  trControl = fitControl, 
                     metric=metric, tuneLength = tuneLength,  tuneGrid = gbmGrid, verbose=FALSE)
    
    gbmFit2
    
    #########################################################
    # Multivariate Adaptive Regression Splines (MARS) model  
    #########################################################
    
    # Step 1: Define the tuneGrid
    marsGrid <-  expand.grid(nprune = c(2, 4, 6, 8, 10), 
                             degree = c(1, 2, 3))
    
    # Train the model using randomForest and predict on the training data itself.
    model_mars = train(Country ~ .,  data=train.data,  method='earth',  metric='ROC', tuneGrid = marsGrid, trControl = fitControl, tuneLength = tuneLength)
    
    model_mars
    
    ###############################
    # Single Vector Machine (SVM) #
    ###############################
    
    model_svmRadial = train(Country ~ ., data=train.data, method='svmRadial',, metric=metric, trControl = fitControl, tuneLength= tuneLength)
    
    model_svmRadial
    
    ###############################################
    # Recursive Partitioning Classification Trees #
    ###############################################
    
    f <- as.formula(paste0("Country ~ ", paste0(names(train.data)[2:10], collapse = "+")))
    
    rpart.ctrl <- rpart.control(minsplit = 5, minbucket = 5, cp = seq(0, 0.02, 0.0001))
    
    dt.rpart <- train(form = f, data = train.data, method = "rpart", metric = metric, trControl = fitControl, tuneGrid = rpart.ctrl, tuneLength= tuneLength)
    dt.rpart
    
    ##############
    Naive Bayes
    ############
    
    #Tune the model
    nb_tune <- data.frame(usekernel = TRUE, fL = 0, adjust=seq(0, 5, by = 1))
    
    #Train the model
    model.nb = train(Country ~., data=train.data,'nb', trControl=fitControl, metric=metric, tuneLength=tuneLength, tuneGrid = nb_tune, laplace = 0:3)
    model.nb
    
    #################
    # Random Forest #
    #################
    
    model_rf = train(Country ~., data=train.data, method='rf', metric=metric, tuneLength= tuneLength, trControl = fitControl)
    
    model_rf
    
    #####################################
    # Regularized Discriminant Analysis #
    #####################################
    
    rdaGrid=data.frame(gamma = (0.4)/4, lambda = 3/4)
    
    rdaFit <- train(Country ~ ., data =train.data, method = "rda", trControl = fitControl, tuneLength = tuneLength, metric = "ROC")
    rdaFit    
    
    #####################################
    # Classification with Decision Tree #
    #####################################
    
    #Train the model
    Decision_Fit <- train(Country ~ ., data =train.data, method = "C5.0", trControl = fitControl, tuneLength = tuneLength, metric = "ROC")
    Decision_Fit 
    
    ##################################
    # K-nearest neighbour classifier #
    ##################################
    
    #Train the model
    model_knn = caret::train(Country ~ ., data=train.data, method='knn', 
                             tuneLength = tuneLength, metric=metric, 
                             trControl = fitControl, 
                             tuneGrid = expand.grid(k = seq(1, 101, by = 2)))
    model_knn
    
    #############################
    # Neural Network Classifier #
    #############################
    
    Neural_Fit <- train(Country ~ ., data =train.data, method = "nnet", trControl = fitControl, tuneLength = tuneLength, metric = "ROC")
    Neural_Fit
    

    Predict the models against the test data

    #####
    #LDA#
    #####
    
    pred_LDA = predict(lda.fit.CV, test.data, type="prob")
    pred_LDA
    
    ######################################
    # Stochastic Boosted Gradient Trees #
    #####################################
    
    ## Stochastic Boosted Gradient Trees: model 1
    #Predict the model with the test data
    pred_model_Tree1 = predict(gbmFit1, newdata = test.data, type = "prob")
    pred_model_Tree1
    
    ## Stochastic Boosted Gradient Trees: model 1
    pred_model_Tree1$evaluation <- names(pred_model_Tree1)[apply(pred_model_Tree1, 1, which.max)]
    pred_model_Tree1$evaluation
    
    table(pred_model_Tree1$evaluation)
    
    #Now you can print the confusionMatrix (make sure each factor has the same levels)
    confusionMatrix(factor(pred_model_Tree1$evaluation, levels = unique(test.data$Country)),
                    factor(test.data$Country, levels = unique(test.data$Country)))
    
    #Predict the model with the test data
    pred_model_Tree2 = predict(gbmFit2, newdata = test.data, type = "prob")
    pred_model_Tree2
    
    ## Stochastic Boosted Gradient Trees: model 2
    pred_model_Tree2$evaluation <- names(pred_model_Tree2)[apply(pred_model_Tree2, 1, which.max)]
    pred_model_Tree2$evaluation
    
    #########################################
    # Bagged Flexible Discriminant Analysis #
    ########################################
    
    #Predict the bagged flexible discriminate model with the test data
    Earth_fitted <- predict(model_mars, newdata = test.data, type = "prob")
    Earth_fitted
    
    Earth_fitted$evaluation <- names(Earth_fitted)[apply(Earth_fitted, 1, which.max)]
    Earth_fitted$evaluation
    
    ###############################
    # Single Vector Machine (SVM) #
    ###############################
    
    #Predict the random forest model with the test data
    SVM_fitted <- predict(model_svmRadial, newdata = test.data, type = "prob")
    SVM_fitted
    
    #Evaluate the predictions
    SVM_fitted$evaluation <- names(SVM_fitted)[apply(SVM_fitted, 1, which.max)]
    SVM_fitted$evaluation
    
    ###############################################
    # Recursive Partitioning Classification Trees #
    ##############################################
    
    #Predict the random forest model with the test data
    rpart_fitted <- predict(dt.rpart, newdata = test.data, type = "prob")
    rpart_fitted
    
    #produce a dataframe
    rpart_fit<-as.data.frame(rpart_fitted)
    rpart_fit
    
    #Evaluate the predictions
    rpart_fit$evaluation <- factor(max.col(rpart_fit[,1:3]), levels=1:3, labels = c("France", "Spain", "Holland"))
    rpart_fit$evaluation 
    
    ###############
    # Naïve Bayes #
    #############
    
    #Predict the model with probabilities
    predict.nb<-predict(model.nb$finalModel, newdata = test.data, type = "prob")
    predict.nb
    
    #Predict the model with the classes
    pedict.class.nb<-predict(model.nb$finalModel, newdata = test.data, type = "prob")$class
    pedict.class.nb
    
    #Unlist the results as the function table() and confusionMatrix do not recognize lists
    unlist.predicted.nb.Country <-unlist(predict.nb$class)
    unlist.predicted.nb.Country
    
    unlist.predicted.nb.posterior <-unlist(predict.nb$posterior)
    unlist.predicted.nb.posterior
    
    #produce a dataframe
    nb_fit<-as.data.frame(unlist.predicted.nb.posterior)
    nb_fit
    
    #Evaluate the predictions
    nb_fit$evaluation <- factor(max.col(nb_fit[,1:3]), levels=1:3, labels=c(""France", "Spain", "Holland""))
    nb_fit$evaluation 
    
    #################
    # Random Forest #
    ################
    
    #Now that we have generated a classification model
    #Model Evaluation
    pred_rf<-predict(model_rf, newdata = test.data, type = "prob")
    pred_rf
    
    #Evaluate the predictions
    pred_rf$evaluation <- factor(max.col(pred_rf[,1:3]), levels=1:3, labels = c("France", "Spain", "Holland"))
    pred_rf$evaluation
    
    #####################################
    # Regularized Discriminant Analysis #
    #####################################
    
    #Now that we have generated a classification model
    #Model Evaluation
    pred_rda<-predict(rdaFit, newdata = test.data, type = "prob")
    pred_rda
    
    #Evaluate the predictions
    rdaFit $evaluation <- factor(max.col(pred_rda[,1:3]), levels=1:3, labels = c("France", "Spain", "Holland"))
    rdaFit $evaluation
    
    ##################
    # Decision Trees #
    ##################
    
    #Now that we have generated a classification model
    #Model Evaluation
    pred_decision<-predict(Decision_Fit, newdata = test.data, type = "prob")
    pred_decision
    
    #Evaluate the predictions
    pred_decision$evaluation <- factor(max.col(pred_decision[,1:3]), levels=1:3, labels = c(""France", "Spain", "Holland""))
    pred_decision$evaluation
    
    #########################
    # knn nearest neighbour #
    ########################
    
    #Now that we have generated a classification model
    #Model Evaluation
    pred_knn<-predict(model_knn, newdata = test.data, type = "prob")
    pred_knn
    
    #Evaluate the predictions
    pred_knn$evaluation <- factor(max.col(pred_knn[,1:3]), levels=1:3, labels = c("France", "Spain", "Holland"))
    pred_knn$evaluation
    
    ##################
    # Neural Network #
    ##################
    
    #Now that we have generated a classification model
    #Model Evaluation
    pred_net<-predict(Neural_Fit, newdata = test.data, type = "prob")
    pred_net
    
    #Evaluate the predictions
    pred_net$evaluation <- factor(max.col(pred_net[,1:3]), levels=1:3, labels = c("France", "Spain", "Holland"))
    pred_net$evaluation
    
    #Contingency table of predictions
    table(pred_net$evaluation)
    
    #Now you can print the confusionMatrix (make sure each factor has the same levels)
    confusionMatrix(factor(pred_net$evaluation, levels = unique(test.data$Country)),
    

    Produce the multi.roc() objects using the library pROC

    library(pROC) # Compute roc
    library(ROCR)
    library(MASS)
    library(caret)
    
    #LDA
    roc_LDA <- multiclass.roc(test.data$Country, pred_LDA, levels=c("France", "Spain", "Holland"))
    roc_LDA 
    
    #Boosted Tree model 1
    roc_Tree1 <- multiclass.roc(test.data$Country, pred_model_Tree1[1:3], levels=c("France", "Spain", "Holland"))
    roc_Tree1 
    
    #Boosted Tree model 2
    roc_Tree2 <- multiclass.roc(test.data$Country, pred_model_Tree2[1:3], levels=c("France", "Spain", "Holland"))
    roc_Tree2 
    
    #Mars model
    roc_Mars <- multiclass.roc(test.data$Country, Earth_fitted[1:3], levels=c("France", "Spain", "Holland"))
    roc_Mars
    
    #Single Vector Machine
    roc_SVM <- multiclass.roc(test.data$Country, SVM_fitted[1:3], levels=c("France", "Spain", "Holland"))
    roc_SVM
    
    #Single Vector Machine
    roc_RART <- multiclass.roc(test.data$Country, rpart_fitted, levels=c("France", "Spain", "Holland"))
    roc_RART
    
    #Naive Bayes
    roc_nb <- multiclass.roc(test.data$Country, predict.nb$posterior, levels=c("France", "Spain", "Holland"))
    roc_nb 
    
    #Single Vector Machine
    roc_RPART <- multiclass.roc(test.data$Country, rpart_fitted, levels=c("France", "Spain", "Holland"))
    roc_RPART
    
    #Random Forest
    roc_RF <- multiclass.roc(test.data$Country, pred_rf[1:3], levels=c("France", "Spain", "Holland"))
    roc_RF
    
    #Regularized Discriminant Analysis
    roc_RDA <- multiclass.roc(test.data$Country, pred_rda, levels=c("France", "Spain", "Holland"))
    roc_RDA
    
    #Decision Trees
    roc_Decision <- multiclass.roc(test.data$Country, pred_decision[1:3], levels=c("France", "Spain", "Holland"))
    roc_Decision
    
    #K nearest neighbour
    roc_knn <- multiclass.roc(test.data$Country, pred_knn[1:3], levels=c("France", "Spain", "Holland"))
    roc_knn
    
    #Neural network
    roc_net <- multiclass.roc(test.data$Country, pred_net[1:3], levels=c("France", "Spain", "Holland"))
    roc_net
    

    Function to plot the multiclass ROC curves

    ##Neural Network
    Neural_Network <- roc_net[['rocs']]
    print(Neural_Network)
    plot.roc(Neural_Network[[1:2]], las=1, lwd=1.7, col = "black", xlab=" 1 - specificity")
    
    #Nearest Neigbour
    Knn <- roc_knn[['rocs']]
    print(Knn)
    plot.roc(Knn[[1:2]], add=TRUE, lwd=1.7, col = "red")
    
    #LDA roc_LDA 
    LDA <- roc_LDA[['rocs']]
    print(rs)
    plot.roc(LDA[[1:2]], add=TRUE, lwd=1.7, col = "green")
    
    #Boosted Tree 1
    Boosted_Gradient1 <- roc_Tree1[['rocs']]
    print(Boosted_Gradient1)
    plot.roc(Boosted_Gradient1[[1:2]], add=TRUE, lwd=1.7, col = "#3300CC")
    
    #Boosted Tree 2
    Boosted_Gradient2 <- roc_Tree1[['rocs']]
    print(Boosted_Gradient2)
    plot.roc(Boosted_Gradient2[[1:2]], add=TRUE, lwd=1.7, col = "#CC66FF")
    
    #Mars
    MARS <- roc_Mars[['rocs']]
    print(MARS)
    plot.roc(MARS[[1:2]], add=TRUE, lwd=1.7, col = "#669933")
    
    #Single Vector Machine
    SVM <- roc_SVM[['rocs']]
    print(SVM)
    plot.roc(SVM[[1:2]], add=TRUE, lwd=1.7, col = "#FFFF00")
    
    #Recursive partitioning
    RPART <- roc_RPART[['rocs']]
    print(RPART)
    plot.roc(RPART[[1:2]], add=TRUE, lwd=1.7, col = "orange")
    
    #Recursive partitioning
    NaiveBayes <- roc_nb[['rocs']]
    print(NaiveBayes)
    plot.roc(NaiveBayes[[1:2]], add=TRUE, lwd=1.7, col = "cyan")
    
    #Random Forest
    Random_Forest <- roc_RF[['rocs']]
    print(Random_Forest)
    plot.roc(Random_Forest[[1:2]], add=TRUE, lwd=1.7, col = "magenta")
    
    #Regularized discriminant analysis
    RDA <- roc_RF[['rocs']]
    print(RDA)
    plot.roc(RDA [[1:2]], add=TRUE, lwd=1.7, col = "#33CC66")
    
    #Regularized discriminant analysis
    Decision_ROC <- roc_Decision[['rocs']]
    print(Decision_ROC)
    plot.roc(Decision_ROC[[1:2]], add=TRUE, lwd=1.7, col = "#3333FF")
    
    #Decision Tree
    Decision_Tree <- roc_Decision[['rocs']]
    print(Decision_Tree)
    plot.roc(Decision_Tree[[1:2]], add=TRUE, lwd=1.7, col = "#FF00CC")
    
    #Neural Network
    Neural_Network <- roc_net[['rocs']]
    print(Neural_Network)
    plot.roc(Neural_Network[[1:2]], add=TRUE, lwd=1.7, col = "#0000FF")
    

    Add a legend

    #Insert a legend
    legend("bottomright", legend = c("Neural_Network: AUC = 0.9379", "Knn: AUC =0.9087", "LDA: AUC = 0.9012", 
                                     "Boosted Gradient 1: AUC = 0.9826", "Boosted_Gradient 2: AUC = 0.9641",
                                     "MARS: AUC = 0.9458", "SVM: AUC = 0.9537", "Rpart: AUC = 0.9077", "NaiveBayes: AUC = 0.8951", 
                                     "Random_Forest: AUC = 0.9876", "RDA: AUC = 0.8982", "Decision Tree: AUC = 0.9832", 
                                     "Neural Network: AUC = 0.9379"), 
                              lty = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 
                              col =c("black", "red", "green", "#3300CC", "#CC66FF", 
                                     "#669933", "#FFFF00", "orange", "cyan", "magenta", "#33CC66", "#3333FF", 
                                     "#FF00CC", "#0000FF"), cex=0.6, lwd=1.5, inset = 0.05,
                                      title = "ROC Curves")
                              par(mfrow = c(1, 1))
                              
    

    ROC Curve Diagram

    enter image description here