Search code examples
rphyloseq

Estimate_richness for all phyla in phyloseq


Is there an easy way to get ASV richness for each Phylum for each Station using the estimate_richness function in phyloseq? Or is there another simple way of extracting the abundance data for each taxonomic rank and calculating richness that way?

So far I have just been subsetting individual Phyla of interest using for example:

ps.Prymnesiophyceae <- subset_taxa(ps, Phylum == "Prymnesiophyceae")  
alpha_diversity<-estimate_richness(ps.Prymnesiophyceae,measure=c("Shannon","Observed"))  
H<-alpha_diversity$Shannon  
S1<-alpha_diversity$Observed  
S<-log(S1)  
evenness<-H/S
alpha<-cbind(Shannon=H,Richness=S1,Evenness=evenness,sample_data(Prymnesiophyceae))

But this is rather a pain when having to do it for e.g. the top 20 phyla.

EDIT: suggestion by @GTM works well until last step. See comment + dput:

> dput(head(sample_names(ps.transect), n=2)) c("2-1-DCM_S21_L001_R1_001.fastq", "2-1-SA_S9_L001_R1_001.fastq" )
> dput(head(alpha, n=2)) structure(list(Observed = c(31, 25), Shannon = c(2.84184012598765, 
2.53358345702604), taxon = c("Prymnesiophyceae", "Prymnesiophyceae" ), sample_id = c("X2.1.DCM_S21_L001_R1_001.fastq", "X2.1.SA_S9_L001_R1_001.fastq" ), S = c(3.43398720448515,
3.2188758248682), evenness = c(0.827562817437384, 
0.787101955736294)), row.names = c("X2.1.DCM_S21_L001_R1_001.fastq",  "X2.1.SA_S9_L001_R1_001.fastq"), class = "data.frame")
> dput(head(smpl_data, n=1)) new("sample_data", .Data = list("001_DCM", 125L, structure(1L, .Label = "DCM", class = "factor"), structure(1L, .Label = "Transect", class = "factor"), structure(1L, .Label = "STZ", class = "factor"), 
    structure(1L, .Label = "STFW", class = "factor"), "Oligotrophic", 
    16L, -149.9978333, -29.997, 130.634, 17.1252, 35.4443, 1025.835008, 
    1.1968, 1e-12, 5.387, 2.8469, 52.26978546, 98.0505, 0, 0, 
    0.02, 0.9, 0, 0, 2069.47, 8.057, 377.3), names = c("Station_neat",  "Depth_our", "Depth_bin", "Loc", "Front", "Water", "Zone", "Bottle",  "Lon", "Lat", "pressure..db.", "Temperature", "Salinity", "Density_kgm.3",  "Fluorescence_ugL", "PAR", "BottleO2_mLL", "CTDO2._mLL", "OxygenSat_.",  "Beam_Transmission", "N_umolL", "NO3_umolL", "PO4_umolL", "SIL_umolL",  "NO2_umolL", "NH4_umolL", "DIC_uMkg", "pH", "pCO2_matm"), row.names = "2-1-DCM_S21_L001_R1_001.fastq", 
    .S3Class = "data.frame")

Solution

  • You can wrap your code in a for loop to do so. I've slightly modified your code to make it a bit more flexible, see below.

    require("phyloseq")
    require("dplyr")
    
    # Calculate alpha diversity measures for a specific taxon at a specified rank.
    # You can pass any parameters that you normally pass to `estimate_richness`
    estimate_diversity_for_taxon <- function(ps, taxon_name, tax_rank = "Phylum", ...){
      
      # Subset to taxon of interest
      tax_tbl <- as.data.frame(tax_table(ps))
      keep <- tax_tbl[,tax_rank] == taxon_name
      keep[is.na(keep)] <- FALSE
      ps_phylum <- prune_taxa(keep, ps)
      
      # Calculate alpha diversity and generate a table
      alpha_diversity <- estimate_richness(ps_phylum, ...)
      alpha_diversity$taxon <- taxon_name
      alpha_diversity$sample_id <- row.names(alpha_diversity)
      return(alpha_diversity)
    }
    
    # Load data
    data(GlobalPatterns)
    ps <- GlobalPatterns
    
    # Estimate alpha diversity for each phylum
    phyla <- get_taxa_unique(ps, 
                             taxonomic.rank = 'Phylum')
    phyla <- phyla[!is.na(phyla)]
    alpha <- data.frame()
    for (phylum in phyla){
      a <- estimate_diversity_for_taxon(ps = ps, 
                                        taxon_name = phylum,
                                        measure = c("Shannon", "Observed"))
      alpha <- rbind(alpha, a)
    }
    
    # Calculate the additional alpha diversity measures
    alpha$S <- log(alpha$Observed)
    alpha$evenness <- alpha$Shannon/alpha$S
    
    # Add sample data
    smpl_data <- as.data.frame(sample_data(ps))
    alpha <- left_join(alpha, 
                       smpl_data, 
                       by = c("sample_id" = "X.SampleID"))
    

    This is a reproducible example with GlobalPatterns. Make sure to alter the code to match your data by replacing X.SampleID in the left join with the name of the column that contains the sample IDs in your sample_data. If there is no such column, you can create it from the row names:

    smpl_data <- as.data.frame(sample_data(ps))
    smpl_data$sample_id < row.names(smpl_data)
    alpha <- left_join(alpha, 
                       smpl_data, 
                       by = c("sample_id" = "sample_id"))