Search code examples
rcluster-analysisheatmaphierarchical-clustering

HeatMap: how to cluster only the rows and keep order of the heatmap's column labels as same as in the df?


I wanna plot a heatmap and cluster only the rows (i.e. genes in this tydf1). Also, wanna keep order of the heatmap's column labels as same as in the df (i.e. tydf1)?

Sample data

df1 <- structure(list(Gene = c("AA", "PQ", "XY", "UBQ"), X_T0_R1 = c(1.46559502, 0.220140568, 0.304127515, 1.098842127), X_T0_R2 = c(1.087642983, 0.237500819, 0.319844338, 1.256624804), X_T0_R3 = c(1.424945196, 0.21066267, 0.256496284, 1.467120048), X_T1_R1 = c(1.289943948, 0.207778662, 0.277942721, 1.238400358), X_T1_R2 = c(1.376535013, 0.488774258, 0.362562315, 0.671502431), X_T1_R3 = c(1.833390311, 0.182798731, 0.332856558, 1.448757569), X_T2_R1 = c(1.450753714, 0.247576125, 0.274415259, 1.035410946), X_T2_R2 = c(1.3094609, 0.390028842, 0.352460646, 0.946426593), X_T2_R3 = c(0.5953716, 1.007079177, 1.912258811, 0.827119776), X_T3_R1 = c(0.7906009, 0.730242116, 1.235644748, 0.832287694), X_T3_R2 = c(1.215333041, 1.012914813, 1.086362205, 1.00918082), X_T3_R3 = c(1.069312467, 0.780421013, 1.002313082, 1.031761442), Y_T0_R1 = c(0.053317766, 3.316414959, 3.617213894, 0.788193798), Y_T0_R2 = c(0.506623748, 3.599442788, 1.734075583, 1.179462912), Y_T0_R3 = c(0.713670106, 2.516735845, 1.236204882, 1.075393433), Y_T1_R1 = c(0.740998252, 1.444496448, 1.077023349, 0.869258744), Y_T1_R2 = c(0.648231834, 0.097957459, 0.791438659, 0.428805547), Y_T1_R3 = c(0.780499252, 0.187840968, 0.820430227, 0.51636582), Y_T2_R1 = c(0.35344654, 1.190274584, 0.401845911, 1.223534348), Y_T2_R2 = c(0.220223951, 1.367784148, 0.362815405, 1.102117612), Y_T2_R3 = c(0.432856978, 1.403057729, 0.10802472, 1.304233845), Y_T3_R1 = c(0.234963735, 1.232129062, 0.072433381, 1.203096462), Y_T3_R2 = c(0.353770497, 0.885122768, 0.011662112, 1.188149743), Y_T3_R3 = c(0.396091395, 1.333921747, 0.192594116, 1.838029829), Z_T0_R1 = c(0.398000559, 1.286528398, 0.129147097, 1.452769794), Z_T0_R2 = c(0.384759325, 1.122251177, 0.119475721, 1.385513609), Z_T0_R3 = c(1.582230097, 0.697419716, 2.406671502, 0.477415567), Z_T1_R1 = c(1.136843842, 0.804552001, 2.13213228, 0.989075996), Z_T1_R2 = c(1.275683837, 1.227821594, 0.31900326, 0.835941568), Z_T1_R3 = c(0.963349308, 0.968589683, 1.706670339, 0.807060135), Z_T2_R1 = c(3.765036263, 0.477443352, 1.712841882, 0.469173869), Z_T2_R2 = c(1.901023385, 0.832736132, 2.223429427, 0.593558769), Z_T2_R3 = c(1.407713024, 0.911920317, 2.011259223, 0.692553388), Z_T3_R1 = c(0.988333629, 1.095130142, 1.648598854, 0.629915612), Z_T3_R2 = c(0.618606729, 0.497458337, 0.549147265, 1.249492088), Z_T3_R3 = c(0.429823986, 0.471389536, 0.977124788, 1.136635484)), row.names = c(NA, -4L ), class = c("data.table", "data.frame"))

Scripts used

library(dplyr) 
library(stringr) 
library(tidyr) 
gdf1 <- gather(df1, "group", "Expression", -Gene) 
gdf1$tgroup <- apply(str_split_fixed(gdf1$group, "_", 3)[, c(1, 2)], 
                     1, paste, collapse ="_")

library(dplyr) 
tydf1 <- gdf1 %>% 
  group_by(Gene, tgroup) %>% 
  summarize(expression_mean = mean(Expression)) %>% 
  spread(., tgroup, expression_mean)

#1 heatmap script is being used

library(tidyverse)
tydf1 <- tydf1 %>% 
  as.data.frame() %>% 
  column_to_rownames(var=colnames(tydf1)[1]) 

library(gplots) 
library(vegan) 
randup.m <- as.matrix(tydf1) 
scaleRYG <- colorRampPalette(c("red","yellow","darkgreen"), 
                             space = "rgb")(30) 
data.dist <- vegdist(randup.m, method = "euclidean") 
row.clus <- hclust(data.dist, "aver") 
heatmap.2(randup.m, Rowv = as.dendrogram(row.clus), 
          dendrogram = "row", col = scaleRYG, margins = c(7,10), 
          density.info = "none", trace = "none", lhei = c(2,6), 
          colsep = 1:3, sepcolor = "black", sepwidth = c(0.001,0.0001), 
          xlab = "Identifier", ylab = "Rows")

#2 heatmap script is being used

df2 <- as.matrix(tydf1[, -1]) 
heatmap(df2)

Also, I want to add a color key.


Solution

  • It is still unclear to me, what the desired output is. There are some notes:

    • You don't need to use vegdist() to calculate distance matrix for your hclust() call. Because if you check all(vegdist(randup.m, method = "euclidian") == dist(randup.m)) it returns TRUE;
    • Specifying Colv = F in your heatmap.2() call will prevent reordering of the columns (default is TRUE);
    • Maybe it is better to scale your data by row (see the uncommented row);
    • Your call of heatmap.2() returns the heatmap with color key.

    So summing it up - in your first script you just miss the Colv = F argument, and after a little adjustment it looks like this:

    heatmap.2(randup.m, 
              Rowv = as.dendrogram(row.clus),
              Colv = F,
              dendrogram = "row", 
              #scale = "row", 
              col = scaleRYG,
              density.info = "none",
              trace = "none",
              srtCol = -45,
              adjCol = c(.1, .5),
              xlab = "Identifier",
              ylab = "Rows"
              )
    

    heatmap.2

    However I am still not sure - is it what you need?