Search code examples
rplotclassificationlda

How to plot classification borders on an Linear Discrimination Analysis plot in R


I have used a linear discriminant analysis (LDA) to investigate how well a set of variables discriminates between 3 groups. I then used the plot.lda() function to plot my data on the two linear discriminants (LD1 on the x-axis and LD2 on the y-axis). I would now like to add the classification borders from the LDA to the plot. I cannot see a argument in the function that allows this. The partimat() function allows visualisation of the LD classification borders, but variables are used as the x and y axes in this case, rather than the linear discriminants. Any advice on how to add classification borders to plot.lda would be greatly appreciated. Below is some example code:

library(MASS)

# LDA
t.lda = lda(Group ~ Var1 + Var2, data=mydata, 
                na.action="na.omit", CV=TRUE) 

# Scatter plot using the two discriminant dimensions 
plot(t.lda, 
     panel = function(x, y, ...) { points(x, y, ...) },
     col = c(4,2,3)[factor(mydata$Group)], 
     pch = c(17,19,15)[factor(mydata$Group)],
     ylim=c(-3,3), xlim=c(-5,5))

Below is some example data (3 groups, 2 variables):

> dput(mydata)
structure(list(Group = c("a", "a", "a", "a", "a", "a", "a", "a", 
"b", "b", "b", "b", "b", "b", "b", "b", "c", "c", "c", "c", "c", 
"c", "c", "c"), Var1 = c(7.5, 6.9, 6.5, 7.3, 8.1, 8, 7.4, 7.8, 
8.3, 8.7, 8.9, 9.3, 8.5, 9.6, 9.8, 9.7, 11.2, 10.9, 11.5, 12, 
11, 11.6, 11.7, 11.3), Var2 = c(-6.5, -6.2, -6.7, -6.9, -7.1, 
-8, -6.5, -6.3, -9.3, -9.5, -9.6, -9.1, -8.9, -8.7, -9.9, -10, 
-6.7, -6.4, -6.8, -6.1, -7.1, -8, -6.9, -6.6)), .Names = c("Group", 
"Var1", "Var2"), class = "data.frame", row.names = c(NA, -24L
))

> head(mydata)

  Group Var1 Var2
1     a  7.5 -6.5
2     a  6.9 -6.2
3     a  6.5 -6.7
4     a  7.3 -6.9
5     a  8.1 -7.1
6     a  8.0 -8.0

EDIT: Following on from Roman's answer, I tried to alter the code to plot the classification border on the linear discriminant scale (this is what I am trying to achieve), rather than on the scale of the original variables. However, the border does not sit where it should. Any advice on what I am doing wrong here would be much appreciated:

#create new data
np = 300
nd.x = seq(from = min(mydata$Var1), to = max(mydata$Var1), length.out = np)
nd.y = seq(from = min(mydata$Var2), to = max(mydata$Var2), length.out = np)
nd = expand.grid(Var1 = nd.x, Var2 = nd.y)

#run lda and predict using new data
new.lda = lda(Group ~ Var1 + Var2, data=mydata) 
prd = as.numeric(predict(new.lda, newdata = nd)$class)

#create LD sequences from min - max values 
p = predict(new.lda, newdata= nd)
p.x = seq(from = min(p$x[,1]), to = max(p$x[,1]), length.out = np) #LD1 scores
p.y = seq(from = min(p$x[,2]), to = max(p$x[,2]), length.out = np) #LD2 scores

#create original plot 
quartz()
plot(t.lda, panel = function(x, y, ...) { points(x, y, ...) },
     col = c(4,2,3)[factor(mydata$Group)], 
     pch = c(17,19,15)[factor(mydata$Group)],
     ylim=c(-3,3), xlim=c(-5,5))

#add classification border on scale of linear discriminants (NOTE: this step currently doesn't work)
contour(x = p.x, y = p.y, z = matrix(prd, nrow = np, ncol = np), 
         levels = c(1, 2, 3), add = TRUE, drawlabels = FALSE)

Solution

  • I adapted my code to follow the example found here.

    require(MASS)
    
    # generate data
    set.seed(357)
    Ng <- 100 # number of cases per group
    group.a.x <- rnorm(n = Ng, mean = 2, sd = 3)
    group.a.y <- rnorm(n = Ng, mean = 2, sd = 3)
    
    group.b.x <- rnorm(n = Ng, mean = 11, sd = 3)
    group.b.y <- rnorm(n = Ng, mean = 11, sd = 3)
    
    group.a <- data.frame(x = group.a.x, y = group.a.y, group = "A")
    group.b <- data.frame(x = group.b.x, y = group.b.y, group = "B")
    
    my.xy <- rbind(group.a, group.b)
    
    # construct the model
    mdl <- lda(group ~ x + y, data = my.xy)
    
    # draw discrimination line
    np <- 300
    nd.x <- seq(from = min(my.xy$x), to = max(my.xy$x), length.out = np)
    nd.y <- seq(from = min(my.xy$y), to = max(my.xy$y), length.out = np)
    nd <- expand.grid(x = nd.x, y = nd.y)
    
    prd <- as.numeric(predict(mdl, newdata = nd)$class)
    
    plot(my.xy[, 1:2], col = my.xy$group)
    points(mdl$means, pch = "+", cex = 3, col = c("black", "red"))
    contour(x = nd.x, y = nd.y, z = matrix(prd, nrow = np, ncol = np), 
            levels = c(1, 2), add = TRUE, drawlabels = FALSE)
    

    enter image description here

    EDIT

    If I try

    library(MASS)
    
    mydata <- structure(list(Group = c("a", "a", "a", "a", "a", "a", "a", "a", 
                                       "b", "b", "b", "b", "b", "b", "b", "b", "c", "c", "c", "c", "c", 
                                       "c", "c", "c"), Var1 = c(7.5, 6.9, 6.5, 7.3, 8.1, 8, 7.4, 7.8, 
                                                                8.3, 8.7, 8.9, 9.3, 8.5, 9.6, 9.8, 9.7, 11.2, 10.9, 11.5, 12, 
                                                                11, 11.6, 11.7, 11.3), Var2 = c(-6.5, -6.2, -6.7, -6.9, -7.1, 
                                                                                                -8, -6.5, -6.3, -9.3, -9.5, -9.6, -9.1, -8.9, -8.7, -9.9, -10, 
                                                                                                -6.7, -6.4, -6.8, -6.1, -7.1, -8, -6.9, -6.6)), .Names = c("Group", 
                                                                                                                                                           "Var1", "Var2"), class = "data.frame", row.names = c(NA, -24L
                                                                                                                                                       ))
    
    np <- 300    
    
    nd.x = seq(from = min(mydata$Var1), to = max(mydata$Var1), length.out = np)
    nd.y = seq(from = min(mydata$Var2), to = max(mydata$Var2), length.out = np)
    nd = expand.grid(Var1 = nd.x, Var2 = nd.y)
    
    #run lda and predict using new data
    new.lda = lda(Group ~ Var1 + Var2, data=mydata) 
    prd = as.numeric(predict(new.lda, newdata = nd)$class)
    
    #create LD sequences from min - max values 
    p = predict(new.lda, newdata= nd)
    p.x = seq(from = min(p$x[,1]), to = max(p$x[,1]), length.out = np) #LD1 scores
    p.y = seq(from = min(p$x[,2]), to = max(p$x[,2]), length.out = np) #LD2 scores
    
    # notice I don't use t.lda for first variable
    plot(new.lda, panel = function(x, y, ...) { points(x, y, ...) },
         col = c(4,2,3)[factor(mydata$Group)], 
         pch = c(17,19,15)[factor(mydata$Group)],
         ylim=c(-3,3), xlim=c(-5,5))
    
    contour(x = p.x, y = p.y, z = matrix(prd, nrow = np, ncol = np), 
            levels = c(1, 2, 3), add = TRUE, drawlabels = FALSE)
    

    I get

    enter image description here