Search code examples
ggplot2veganphyloseqrdabiplot

biplot, or plotting a partial RDA with ggplot2. what is not working with the arrows computed with var_fit?


I am trying to plot a partial RDA and some environmental parameters on top of that.

I followed the idea from this post to plot the RDA with ggplot2. I also found this other post which, instead, allows for plotting with base R. I modified it, to add the arrows for environmental parameters.

I am aware of the existence of ggvegan but, even using fortify as suggested, i am unable to get the two plots, in particular the length of the arrows, to match. What am i doing wrong?

If i look at the content of arrows, it looks like ggplot2 is doing it's job. So is it base R that is plotting arrows wrongly?

Here is the code:

library("ggplot2")
library("phyloseq")
library("vegan")
library("scales")

# load data
data(soilrep)

# add some random env vars
meta_data <- as.data.frame(sample_data(soilrep))
meta_data$env_var_one <- sample(100, nrow(meta_data), replace=T)
meta_data$env_var_two <- sample(100, nrow(meta_data), replace=T)
meta_data$env_var_three <- sample(100, nrow(meta_data), replace=T)

set.seed(1)

# compute partial RDA using formula
prda <- ordinate(soilrep, "RDA", formula=as.formula("soilrep~Treatment+warmed+Condition(clipped)"))

# compute fitting
var_fit <- envfit(prda, meta_data[, c("env_var_one", "env_var_two", "env_var_three")], perm=1000, na.rm=TRUE)

# define shapes for treatment legend
shape_legend <- c(15, 16, 17, 18)
# define shapes for treatment in plot
shape <- c(21, 22, 23, 24)
names(shape) <- unique(meta_data$Treatment)

# define colours for warmed
warmed_palette <- hue_pal()(2)[c(2, 1)]
names(warmed_palette) <- c("no", "yes")

# base plotting
dev.new()

par(mar=c(5, 5, 4, 2))
pl <- plot(prda, type="none", scaling=1)
with(meta_data, plot(var_fit, add=T, col="black", p.max=0.99))
with(meta_data, points(prda, "sites", pch=shape[Treatment], bg=warmed_palette[warmed], col="black", scaling=1))
with(meta_data, legend("topright", legend=levels(warmed), fill=warmed_palette, title="warmed"))
with(meta_data, legend("topleft", legend=levels(Treatment), pch=shape_legend, title="Treatment"))

# create data frame for ggplot plotting
# get scores for sites
ggplot_rda <- as.data.frame(vegan::scores(prda, display="sites"))

# add some columns for plotting purposes
ggplot_rda$label <- rownames(ggplot_rda)
ggplot_rda$treatment <- meta_data$Treatment
ggplot_rda$warmed <- meta_data$warmed

# as factor
ggplot_rda$treatment <- factor(ggplot_rda$treatment, levels=unique(ggplot_rda$treatment))
ggplot_rda$warmed <- factor(ggplot_rda$warmed, levels=unique(ggplot_rda$warmed))

# get scores for arrows
arrows <- as.data.frame(vegan::scores(var_fit, display = "vectors"))
arrows$names <- rownames(arrows)

# ggplotting
dev.new()

ggplot(ggplot_rda) +
    geom_point(mapping = aes(x=RDA1, y=RDA2, fill=warmed, shape=treatment)) +
    coord_fixed(ratio=1) +
    scale_shape_manual(values=shape) +
    scale_fill_manual(values=warmed_palette) +
    geom_segment(data=arrows, aes(x = 0, xend = RDA1, y = 0, yend = RDA2), arrow = arrow(length = unit(0.25, "cm"))) +
    geom_text(data=arrows, aes(x = RDA1, y = RDA2, label = names))+
    guides(fill=guide_legend(override.aes=list(shape=21)))

Solution

  • A partial solution: as mentioned in the comment, to mimic the behaviour of base R, a multiplying factor needs to be computed, to scale the arrows properly. This was also suggested in this answer from the question i linked in the original post.

    There is still a problem which i have not solved, though. Base R requires a scaling factor, that can either be 1 or 2. My solution seems to only work with scaling = 2, as i show below. I am not sure why scaling = 1 still is not working but i am investigating.

    EDIT: following the comment below, i think i solved the scaling problem. All seems to work now:

    # load libraries
    library("ggplot2")
    library("phyloseq")
    library("vegan")
    library("scales")
    
    # set scaling factor
    scaling_factor <- 1
    
    graphics.off()
    
    # load data
    data(soilrep)
    
    # add some random env vars
    set.seed(0)
    meta_data <- as(sample_data(soilrep), "data.frame")
    meta_data$env_var_one <- sample(100, nrow(meta_data), replace=T)
    meta_data$env_var_two <- sample(100, nrow(meta_data), replace=T)
    meta_data$env_var_three <- sample(100, nrow(meta_data), replace=T)
    
    # compute partial RDA using formula
    prda <- ordinate(soilrep, "RDA", formula=as.formula("soilrep~Treatment+warmed+Condition(clipped)"))
    
    # compute fitting
    var_fit <- envfit(prda, meta_data[, c("env_var_one", "env_var_two", "env_var_three")], perm=1000, na.rm=TRUE, scaling=scaling_factor)
    
    # define shapes for treatment legend
    shape_legend <- c(15, 16, 17, 18)
    # define shapes for treatment in plot
    shape <- c(21, 22, 23, 24)
    names(shape) <- unique(meta_data$Treatment)
    
    # define colours for warmed
    warmed_palette <- hue_pal()(2)[c(2, 1)]
    names(warmed_palette) <- c("no", "yes")
    
    # set levels
    # as factor
    meta_data$Treatment <- factor(meta_data$Treatment, levels=unique(meta_data$Treatment))
    meta_data$warmed <- factor(meta_data$warmed, levels=unique(meta_data$warmed))
    
    # base plotting
    dev.new()
    
    plot(prda, type="none", scaling=scaling_factor)
    with(meta_data, plot(var_fit, add=T, col="black", p.max=0.99))
    with(meta_data, points(prda, "sites", pch=shape[Treatment], bg=warmed_palette[warmed], col="black", scaling=scaling_factor))
    with(meta_data, legend("topright", legend=levels(warmed), fill=warmed_palette, title="warmed"))
    with(meta_data, legend("topleft", legend=levels(Treatment), pch=shape_legend, title="Treatment"))
    
    # create data frame for ggplot plotting
    # get scores for sites
    ggplot_rda <- as.data.frame(vegan::scores(prda, display="sites", scaling=scaling_factor))
    
    # add some columns for plotting purposes
    ggplot_rda$label <- rownames(ggplot_rda)
    ggplot_rda$treatment <- meta_data$Treatment
    ggplot_rda$warmed <- meta_data$warmed
    
    # as factor
    ggplot_rda$treatment <- factor(ggplot_rda$treatment, levels=unique(ggplot_rda$treatment))
    ggplot_rda$warmed <- factor(ggplot_rda$warmed, levels=unique(ggplot_rda$warmed))
    
    # get multiplying factors for the arrows
    arrow_factor <- ordiArrowMul(var_fit)
    # get arrows and scale them
    arrows <- as.data.frame(vegan::scores(var_fit, display = "vectors"), scaling=scaling_factor) * arrow_factor
    arrows$names <- rownames(arrows)
    
    # select arrows which have meaningful p-value, if needed
    # for this example, we keep a threshold of 0.99
    threshold_p <- 0.99
    signif_arrows <- names(which(var_fit$vectors$pvals<=threshold_p))
    # subset
    print_arrows <- arrows[signif_arrows, ]
    
    # ggplotting
    dev.new()
    
    ggplot(ggplot_rda) +
        geom_point(mapping = aes(x=RDA1, y=RDA2, fill=warmed, shape=treatment)) +
        coord_fixed() +
        scale_shape_manual(values=shape) +
        scale_fill_manual(values=warmed_palette) +
        geom_segment(data=print_arrows, aes(x = 0, xend = RDA1, y = 0, yend = RDA2), arrow = arrow(length = unit(0.25, "cm")), colour="black") +
        geom_text(data=print_arrows, aes(x = RDA1, y = RDA2, label = names))+
        guides(fill=guide_legend(override.aes=list(shape=21))) +
        theme_bw()
    

    EDIT:

    further addition since exporting seems complicated by some other factors that are introduced by plot.envfit as reported in the help page. The problem is that when plotting with base R there is a parameter, i.e. arrow.mul that i think must be set as the arrow factor used to multiply. From the help(envfit):

    The lengths of arrows for fitted vectors are automatically adjusted for the physical size of the plot, and the arrow lengths cannot be compared across plots. For similar scaling of arrows, you must explicitly set the ‘arrow.mul’ argument in the ‘plot’ command; see ‘ordiArrowMul’ and ‘ordiArrowTextXY’.

    So this is a solution which will give same plot, no matter which one method (base R or ggplot2) is used:

    # load libraries
    library("ggplot2")
    library("phyloseq")
    library("vegan")
    library("scales")
    
    # set scaling factor
    scaling_factor <- 1
    
    # load data
    data(soilrep)
    
    # add some random env vars
    set.seed(0)
    meta_data <- as(sample_data(soilrep), "data.frame")
    meta_data$env_var_one <- sample(100, nrow(meta_data), replace=T)
    meta_data$env_var_two <- sample(100, nrow(meta_data), replace=T)
    meta_data$env_var_three <- sample(100, nrow(meta_data), replace=T)
    
    # export metadata
    write.table(meta_data, "metadata_prints_to_screen.tsv", quote=F, row.names=F, sep="\t")
    
    # compute partial RDA using formula
    prda <- ordinate(soilrep, "RDA", formula=as.formula("soilrep~Treatment+warmed+Condition(clipped)"))
    
    # compute fitting
    var_fit <- envfit(prda, meta_data[, c("env_var_one", "env_var_two", "env_var_three")], perm=1000, na.rm=TRUE, scaling=scaling_factor)
    
    # get multiplying factors for the arrows
    arrow_factor <- ordiArrowMul(var_fit)
    
    # define shapes for treatment legend
    shape_legend <- c(15, 16, 17, 18)
    # define shapes for treatment in plot
    shape <- c(21, 22, 23, 24)
    names(shape) <- unique(meta_data$Treatment)
    
    # define colours for warmed
    warmed_palette <- hue_pal()(2)[c(2, 1)]
    names(warmed_palette) <- c("no", "yes")
    
    # set levels
    # as factor
    meta_data$Treatment <- factor(meta_data$Treatment, levels=unique(meta_data$Treatment))
    meta_data$warmed <- factor(meta_data$warmed, levels=unique(meta_data$warmed))
    
    # new dev
    dev.new()
    
    # plot with base R
    plot(prda, type="none", scaling=scaling_factor)
    with(meta_data, plot(var_fit, add=T, col="black", p.max=0.99, arrow.mul=arrow_factor))
    with(meta_data, points(prda, "sites", pch=shape[Treatment], bg=warmed_palette[warmed], col="black", scaling=scaling_factor))
    with(meta_data, legend("topright", legend=levels(warmed), fill=warmed_palette, title="warmed"))
    with(meta_data, legend("topleft", legend=levels(Treatment), pch=shape_legend, title="Treatment"))
    
    # create data frame for ggplot plotting
    # get scores for sites
    ggplot_rda <- as.data.frame(vegan::scores(prda, display="sites", scaling=scaling_factor))
    
    # add some columns for plotting purposes
    ggplot_rda$label <- rownames(ggplot_rda)
    ggplot_rda$treatment <- meta_data$Treatment
    ggplot_rda$warmed <- meta_data$warmed
    
    # as factor
    ggplot_rda$treatment <- factor(ggplot_rda$treatment, levels=unique(ggplot_rda$treatment))
    ggplot_rda$warmed <- factor(ggplot_rda$warmed, levels=unique(ggplot_rda$warmed))
    
    # get arrows and scale them
    arrows <- as.data.frame(vegan::scores(var_fit, display = "vectors"), scaling=scaling_factor) * arrow_factor
    arrows$names <- rownames(arrows)
    
    # select arrows which have meaningful p-value, if needed
    # for this example, we keep a threshold of 0.99
    threshold_p <- 0.99
    signif_arrows <- names(which(var_fit$vectors$pvals<=threshold_p))
    # subset
    print_arrows <- arrows[signif_arrows, ]
    
    # make the plot
    export_ggplot <- ggplot(ggplot_rda) +
        geom_point(mapping = aes(x=RDA1, y=RDA2, fill=warmed, shape=treatment)) +
        coord_fixed() +
        scale_shape_manual(values=shape) +
        scale_fill_manual(values=warmed_palette) +
        geom_segment(data=print_arrows, aes(x = 0, xend = RDA1, y = 0, yend = RDA2), arrow = arrow(length = unit(0.25, "cm")), colour="black") +
        geom_text(data=print_arrows, aes(x = RDA1, y = RDA2, label = names))+
        guides(fill=guide_legend(override.aes=list(shape=21))) +
        theme_bw()
    
    # new dev
    dev.new()
    plot(export_ggplot)