Tags: r, knn, confusion-matrix

confusionMatrix for knn classification in R


I wanted to use the optimal k value to run a kNN classification, predicting the dependent variable diabetes in the test set from the training set, and then compare the predictions with the real values.

I've already found the optimal k value and computed the accuracy. After that, I wanted to compare the predictions with the real values using confusionMatrix, but I ran into a problem with differing lengths.

I've already checked that the nrow and length values are the same (both 74), but I still get the same problem.

Could you help me overcome this problem?

My code is below:

install.packages("mlbench")
install.packages("gbm")

library(mlbench)
library(gbm)

data("PimaIndiansDiabetes2")
head(PimaIndiansDiabetes2)

MLdata <- as.data.frame(PimaIndiansDiabetes2)
head(MLdata)
str(MLdata)
View(MLdata)

any(is.na(MLdata))
sum(is.na(MLdata))

MLdata2 <- na.omit(MLdata)
any(is.na(MLdata2))
sum(is.na(MLdata2))
View(MLdata2)

set.seed(3333)

MLIdx <- sample(1:3, size = nrow(MLdata2), prob = c(0.6, 0.2, 0.2), replace = TRUE)

MLTrain <- MLdata2[MLIdx == 1,]
MLValid <- MLdata2[MLIdx == 2,]
MLTest <- MLdata2[MLIdx == 3,]

head(MLTrain)
head(MLValid)
head(MLTest)

str(MLTrain)
str(MLValid)
str(MLTest)

View(MLTestY)


MLTrainX <- MLTrain[ , -9]
MLValidX <- MLValid[ , -9]
MLTestX <- MLTest[ , -9]

MLTrainY <- as.data.frame(MLTrain[ , 9])
MLValidY <- as.data.frame(MLValid[ , 9])
MLTestY <- as.data.frame(MLTest[ , 9])

View(MLTrainX)
View(MLTrainY)

library(caret)

NormValues <- preProcess(MLTrainX, method = c("center", "scale"))

TrainXNormDF <- predict(NormValues, MLTrainX)
ValidXNormDF <- predict(NormValues, MLValidX)
TestXNormDF <- predict(NormValues, MLTestX)

head(TrainXNormDF)
head(ValidXNormDF)
head(TestXNormDF)


install.packages('FNN')
library(FNN)
library(class)

set.seed(3333)

NN <- knn(train = TrainXNormDF, 
      test = ValidXNormDF,
      cl = MLTrainY$`MLTrain[, 9]`,
      k = 3)

NN

Accuracy3 <- sum(NN == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)

Accuracy3

nrow(TrainXNormDF)
length(MLTrainY$'MLTrain[, 9]')

set.seed(3333)

AccuracyK <- NULL

for(kk in c(1:nrow(TrainXNormDF))){
  Knn_K <- knn(train = TrainXNormDF,
               test = ValidXNormDF,
               cl = MLTrainY$`MLTrain[, 9]`,
               k = kk)
  AccuracyK <- c(AccuracyK, sum(Knn_K == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`))
}


ValidK <- data.frame(k = c(1:nrow(TrainXNormDF)), accuracy = AccuracyK)

min(ValidK[ValidK$accuracy %in% max(AccuracyK), "k"])

plot(formula = accuracy ~ k,
 data = ValidK,
 type = "o",
 pch = 5,
 main = "Optimal K Validation")

with(ValidK, text(accuracy ~ k, labels = rownames(ValidK), pos = 2, cex = 0.5))

set.seed(3333)

NN120 <- knn(train = TrainXNormDF, 
      test = ValidXNormDF,
      cl = MLTrainY$`MLTrain[, 9]`,
      k = 120)

Accuracy120 <- sum(NN120 == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)

Accuracy120

set.seed(3333)

FinalNN <- knn(train = TrainXNormDF, 
           test = TestXNormDF,
           cl = MLTrainY$`MLTrain[, 9]`,
           k = 120)

AccuracyFinal <- sum(FinalNN == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)

AccuracyFinal

And here is where I run into the problem:

Result <- confusionMatrix(FinalNN, TestXNormDF)

Solution

  • AugPelle has answered your question on how to get a confusion matrix.

    I just wanted to point out that you're calculating accuracy incorrectly throughout your code. Each set of predictions should be compared against the labels of the set it was made for (the validation set while tuning k, and the test set at the end), not against the training labels. That mismatch is why you're getting warnings, and it leads you to a non-optimal k.
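
    In essence, the fix is just which labels the predictions are compared against; a minimal sketch using the objects already defined in your code:

    # before: validation predictions compared with the TRAINING labels
    # (lengths differ, so R recycles and warns)
    # Accuracy3 <- sum(NN == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)
    
    # after: compare with the VALIDATION labels that NN actually predicts
    Accuracy3 <- sum(NN == MLValidY$`MLValid[, 9]`) / length(MLValidY$`MLValid[, 9]`)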

    Below is the corrected code:

    install.packages("mlbench")
    install.packages("gbm")
    
    library(mlbench)
    library(gbm)
    
    data("PimaIndiansDiabetes2")
    head(PimaIndiansDiabetes2)
    
    MLdata <- as.data.frame(PimaIndiansDiabetes2)
    head(MLdata)
    str(MLdata)
    View(MLdata)
    
    any(is.na(MLdata))
    sum(is.na(MLdata))
    
    MLdata2 <- na.omit(MLdata)
    any(is.na(MLdata2))
    sum(is.na(MLdata2))
    View(MLdata2)
    
    set.seed(3333)
    
    MLIdx <- sample(1:3, size = nrow(MLdata2), prob = c(0.6, 0.2, 0.2), replace = TRUE)
    
    MLTrain <- MLdata2[MLIdx == 1,]
    MLValid <- MLdata2[MLIdx == 2,]
    MLTest <- MLdata2[MLIdx == 3,]
    
    head(MLTrain)
    head(MLValid)
    head(MLTest)
    
    str(MLTrain)
    str(MLValid)
    str(MLTest)
    
    
    MLTrainX <- MLTrain[ , -9]
    MLValidX <- MLValid[ , -9]
    MLTestX <- MLTest[ , -9]
    
    MLTrainY <- as.data.frame(MLTrain[ , 9])
    MLValidY <- as.data.frame(MLValid[ , 9])
    MLTestY <- as.data.frame(MLTest[ , 9])
    
    View(MLTrainX)
    View(MLTrainY)
    
    library(caret)
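
    # NOTE: preProcess() below estimates the centering/scaling from the training
    # predictors only; the same transformation is then applied to all three splits.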
    
    NormValues <- preProcess(MLTrainX, method = c("center", "scale"))
    
    TrainXNormDF <- predict(NormValues, MLTrainX)
    ValidXNormDF <- predict(NormValues, MLValidX)
    TestXNormDF <- predict(NormValues, MLTestX)
    
    head(TrainXNormDF)
    head(ValidXNormDF)
    head(TestXNormDF)
    
    
    install.packages('FNN')
    library(FNN)
    library(class)
    
    set.seed(3333)
    
    NN <- knn(train = TrainXNormDF, 
              test = ValidXNormDF,
              cl = MLTrainY$`MLTrain[, 9]`,
              k = 3)
    
    NN
    
    Accuracy3 <- sum(NN == MLValidY$`MLValid[, 9]`) / length(MLValidY$`MLValid[, 9]`)
    
    Accuracy3
    
    nrow(TrainXNormDF)
    length(MLTrainY$'MLTrain[, 9]')
    
    set.seed(3333)
    
    AccuracyK <- NULL
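
    # try every candidate k (1 .. number of training rows) and record the
    # accuracy on the VALIDATION set, not the training set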
    
    for(kk in c(1:nrow(TrainXNormDF))){
      Knn_K <- knn(train = TrainXNormDF,
                   test = ValidXNormDF,
                   cl = MLTrainY$`MLTrain[, 9]`,
                   k = kk)
      AccuracyK <- c(AccuracyK, sum(Knn_K == MLValidY$`MLValid[, 9]`) / length(MLValidY$`MLValid[, 9]`))}
    
      
    ValidK <- data.frame(k = c(1:nrow(TrainXNormDF)), accuracy = AccuracyK)
    
    min(ValidK[ValidK$accuracy %in% max(AccuracyK), "k"])
    
    plot(formula = accuracy ~ k,
         data = ValidK,
         type = "o",
         pch = 5,
         main = "Optimal K Validation")
    
    with(ValidK, text(accuracy ~ k, labels = rownames(ValidK), pos = 2, cex = 0.5))
    
    set.seed(3333)
    
    NN36 <- knn(train = TrainXNormDF, 
                test = ValidXNormDF,
                cl = MLTrainY$`MLTrain[, 9]`,
                k = 36)
    
    Accuracy36 <- sum(NN36 == MLValidY$`MLValid[, 9]`) / length(MLValidY$`MLValid[, 9]`)
    
    Accuracy36
    
    set.seed(3333)
    
    FinalNN <- knn(train = TrainXNormDF, 
                   test = TestXNormDF,
                   cl = MLTrainY$`MLTrain[, 9]`,
                   k = 36)
    
    AccuracyFinal <- sum(FinalNN == MLTestY$`MLTest[, 9]`) / length(MLTestY$`MLTest[, 9]`)
    
    AccuracyFinal
    
    Result <- confusionMatrix(FinalNN, MLTestY$`MLTest[, 9]`)
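
    If it helps, the object returned by confusionMatrix() can also be inspected piece by piece; a quick sketch of its standard fields:

    Result                           # prints the table plus accuracy, kappa, sensitivity, ...
    Result$table                     # the raw confusion matrix
    Result$overall["Accuracy"]       # overall accuracy on the test set
    Result$byClass["Sensitivity"]    # per-class statistics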