Search code examples
rggplot2gtableggdendro

R: ggplot height adjustment for clustering dendrogram


The idea is to combine R packages ClustOfVar and ggdendro to give a visual summary of variable clustering.

When there are few columns in the data, the result is pretty good except that there are areas not covered(as circled in the chart below). Using mtcars for example:

library(plyr)
library(ggplot2)
library(gtable)
library(grid)
library(gridExtra)

library(ClustOfVar)
library(ggdendro)

fit = hclustvar(X.quanti = mtcars)

labels = cutree(fit,k = 5)

labelx = data.frame(Names=names(labels),group = paste("Group",as.vector(labels)),num=as.vector(labels))

p1 = ggdendrogram(as.dendrogram(fit), rotate=TRUE)

df2<-data.frame(cluster=cutree(fit, k =5), states=factor(fit$labels,levels=fit$labels[fit$order]))
df3<-ddply(df2,.(cluster),summarise,pos=mean(as.numeric(states)))

p2 = ggplot(df2,aes(states,y=1,fill=factor(cluster)))+geom_tile()+
  scale_y_continuous(expand=c(0,0))+
  theme(axis.title=element_blank(),
        axis.ticks=element_blank(),
        axis.text=element_blank(),
        legend.position="none")+coord_flip()+
  geom_text(data=df3,aes(x=pos,label=cluster))
gp1<-ggplotGrob(p1)
gp2<-ggplotGrob(p2)  
maxHeight = grid::unit.pmax(gp1$heights[2:5], gp2$heights[2:5])
gp1$heights[2:5] <- as.list(maxHeight)
gp2$heights[2:5] <- as.list(maxHeight)
grid.arrange(gp2, gp1, ncol=2,widths=c(1/6,5/6))

enter image description here

When there are a large number of columns, another issue occurs. That is, the height of the color tiles part does not match the height the dendrogram.

library(ClustOfVar)
library(ggdendro)
X = data.frame(mtcars,mtcars,mtcars,mtcars,mtcars,mtcars)

fit = hclustvar(X.quanti = X)

labels = cutree(fit,k = 5)

labelx = data.frame(Names=names(labels),group = paste("Group",as.vector(labels)),num=as.vector(labels))

p1 = ggdendrogram(as.dendrogram(fit), rotate=TRUE)

df2<-data.frame(cluster=cutree(fit, k =5), states=factor(fit$labels,levels=fit$labels[fit$order]))
df3<-ddply(df2,.(cluster),summarise,pos=mean(as.numeric(states)))

p2 = ggplot(df2,aes(states,y=1,fill=factor(cluster)))+geom_tile()+
  scale_y_continuous(expand=c(0,0))+
  theme(axis.title=element_blank(),
        axis.ticks=element_blank(),
        axis.text=element_blank(),
        legend.position="none")+coord_flip()+
  geom_text(data=df3,aes(x=pos,label=cluster))
gp1<-ggplotGrob(p1)
gp2<-ggplotGrob(p2)  
maxHeight = grid::unit.pmax(gp1$heights[2:5], gp2$heights[2:5])
gp1$heights[2:5] <- as.list(maxHeight)
gp2$heights[2:5] <- as.list(maxHeight)
grid.arrange(gp2, gp1, ncol=2,widths=c(1/6,5/6))

enter image description here

@Sandy Muspratt has actually provided an excellent solution to this IF we have the R upgraded to version 3.3.1. R: ggplot slight adjustment for clustering summary

But since I cannot change the version of the R deployed in the corporate server, I wonder if there is any other workaround that can align these two parts.


Solution

  • As far as I can tell, your code is not far wrong. The problem is that you are trying to match a continuous scale to a discrete scale when you merge the two plots. Also, it appears that ggdendrogram() adds additional space to the y-axis.

    library(plyr)
    library(ggplot2)
    library(gtable)
    library(grid)
    library(gridExtra)
    
    library(ClustOfVar)
    library(ggdendro)
    
    # Data
    X = data.frame(mtcars,mtcars,mtcars,mtcars,mtcars,mtcars)
    
    # Cluster analysis
    fit = hclustvar(X.quanti = X)
    
    # Labels data frames
    df2 <- data.frame(cluster = cutree(fit, k =5), 
         states = factor(fit$labels, levels = fit$labels[fit$order]))
    df3 <- ddply(df2, .(cluster), summarise, pos = mean(as.numeric(states)))
    
    # Dendrogram
    # scale_x_continuous() for p1 should match scale_x_discrete() from p2
    # scale_x_continuous strips off the labels. I grab them from df2
    # scale _y_continuous() puts a little space between the labels and the dendrogram
    p1 <- ggdendrogram(as.dendrogram(fit), rotate = TRUE) +
         scale_x_continuous(expand = c(0, 0.5), labels = levels(df2$states), breaks = 1:length(df2$states)) +
         scale_y_continuous(expand = c(0.02, 0)) 
    
    # Tiles and labels
    p2 <- ggplot(df2,aes(states, y = 1, fill = factor(cluster))) +
      geom_tile() +
      scale_y_continuous(expand = c(0, 0)) + 
      scale_x_discrete(expand = c(0, 0)) +
      geom_text(data = df3, aes(x = pos, label = cluster)) +
      coord_flip() +
      theme(axis.title = element_blank(),
            axis.ticks = element_blank(),
            axis.text = element_blank(),
            legend.position = "none")
    
    # Get the ggplot grobs
    gp1 <- ggplotGrob(p1)
    gp2 <- ggplotGrob(p2)  
    
    # Make sure the heights match
    maxHeight <- unit.pmax(gp1$heights, gp2$heights)
    gp1$heights <- as.list(maxHeight)
    gp2$heights <- as.list(maxHeight)
    
    # Combine the two plots
    grid.arrange(gp2, gp1, ncol = 2,widths = c(1/6, 5/6))
    

    enter image description here