Search code examples
rrna-seq

How to subset a data frame in R based on another data frame


I have a data frame with a lot of RNA-seq counts (sample names as column names and genes as row names), and a file of metadata, i.e. sex, tissue type, disease status etc. (sample names as row names, and sex etc. as column names). I would like to create a subset of the RNA-seq counts data that contains just 2 of the tissue types, so that I can look at differential gene expression (DGE). Could someone suggest the best way to do this? I'm very new to working with RNA-seq data, so this may be obvious!

Thank you!

Edit: There are >1000 samples, so selecting the columns manually by their column names would be impractical and error-prone.

hope this gives some insight into counts data:

dput(head(tpm.df[1:2])) 
structure(list(Description = c("DDX11L1", "WASH7P", "MIR6859-1", 
"MIR1302-2HG", "FAM138A", "OR4G4P"), `GTEX-1117F-0226-SM-5GZZ7` = c(0L, 
187L, 0L, 1L, 0L, 0L)), row.names = c("ENSG00000223972.5", 
"ENSG00000227232.5", 
"ENSG00000278267.1", "ENSG00000243485.5", "ENSG00000237613.2", 
"ENSG00000268020.3"), class = "data.frame")

and this is the metadata:

structure(list(SMATSSCR = c(NA, NA, NA, NA, NA, 0L), SMCENTER = c("B1", 
"B1", "B1", "B1, A1", "B1, A1", "B1"), SMPTHNTS = c("", "", "", 
"", "", "2 pieces, ~15% vessel stroma, rep delineated")), row.names = 
c("GTEX-1117F-0003-SM-58Q7G", 
"GTEX-1117F-0003-SM-5DWSB", "GTEX-1117F-0003-SM-6WBT7", 
"GTEX-1117F-0011-R10a-SM-AHZ7F", 
"GTEX-1117F-0011-R10b-SM-CYKQ8", "GTEX-1117F-0226-SM-5GZZ7"), class = 
"data.frame")

Solution

  • Do you have a "Tissue" column in your "metadata" dataframe? If so, you can use this to subset your "metadata" dataframe and then use that to subset your tpm values, e.g.

    # Example expression table taken from the question: genes (Ensembl IDs) as
    # row names, a gene-symbol column, and a single sample column of counts.
    # check.names = FALSE keeps the sample ID (which contains "-") unmangled.
    tpm.df <- data.frame(
      Description = c(
        "DDX11L1", "WASH7P", "MIR6859-1",
        "MIR1302-2HG", "FAM138A", "OR4G4P"
      ),
      `GTEX-1117F-0226-SM-5GZZ7` = c(0L, 187L, 0L, 1L, 0L, 0L),
      row.names = c(
        "ENSG00000223972.5", "ENSG00000227232.5", "ENSG00000278267.1",
        "ENSG00000243485.5", "ENSG00000237613.2", "ENSG00000268020.3"
      ),
      check.names = FALSE,
      stringsAsFactors = FALSE
    )
    
    # Example sample annotation: sample IDs as row names. The TISSUE column is
    # not in the question's dump; it is added here to illustrate the subsetting.
    metadata <- data.frame(
      SMATSSCR = c(rep(NA_integer_, 5L), 0L),
      SMCENTER = c("B1", "B1", "B1", "B1, A1", "B1, A1", "B1"),
      SMPTHNTS = c(rep("", 5L), "2 pieces, ~15% vessel stroma, rep delineated"),
      TISSUE = c("Adipose", "Skin", "Adipose", "Muscle", "Skin", "Nerve"),
      row.names = c(
        "GTEX-1117F-0003-SM-58Q7G",
        "GTEX-1117F-0003-SM-5DWSB",
        "GTEX-1117F-0003-SM-6WBT7",
        "GTEX-1117F-0011-R10a-SM-AHZ7F",
        "GTEX-1117F-0011-R10b-SM-CYKQ8",
        "GTEX-1117F-0226-SM-5GZZ7"
      ),
      stringsAsFactors = FALSE
    )
    
    # Show the expression table (genes in rows, samples in columns)
    print(tpm.df)
    #>                   Description GTEX-1117F-0226-SM-5GZZ7
    #> ENSG00000223972.5     DDX11L1                        0
    #> ENSG00000227232.5      WASH7P                      187
    #> ENSG00000278267.1   MIR6859-1                        0
    #> ENSG00000243485.5 MIR1302-2HG                        1
    #> ENSG00000237613.2     FAM138A                        0
    #> ENSG00000268020.3      OR4G4P                        0
    # Show the sample annotation (samples in rows); wide columns wrap below
    print(metadata)
    #>                               SMATSSCR SMCENTER
    #> GTEX-1117F-0003-SM-58Q7G            NA       B1
    #> GTEX-1117F-0003-SM-5DWSB            NA       B1
    #> GTEX-1117F-0003-SM-6WBT7            NA       B1
    #> GTEX-1117F-0011-R10a-SM-AHZ7F       NA   B1, A1
    #> GTEX-1117F-0011-R10b-SM-CYKQ8       NA   B1, A1
    #> GTEX-1117F-0226-SM-5GZZ7             0       B1
    #>                                                                   SMPTHNTS
    #> GTEX-1117F-0003-SM-58Q7G                                                  
    #> GTEX-1117F-0003-SM-5DWSB                                                  
    #> GTEX-1117F-0003-SM-6WBT7                                                  
    #> GTEX-1117F-0011-R10a-SM-AHZ7F                                             
    #> GTEX-1117F-0011-R10b-SM-CYKQ8                                             
    #> GTEX-1117F-0226-SM-5GZZ7      2 pieces, ~15% vessel stroma, rep delineated
    #>                                TISSUE
    #> GTEX-1117F-0003-SM-58Q7G      Adipose
    #> GTEX-1117F-0003-SM-5DWSB         Skin
    #> GTEX-1117F-0003-SM-6WBT7      Adipose
    #> GTEX-1117F-0011-R10a-SM-AHZ7F  Muscle
    #> GTEX-1117F-0011-R10b-SM-CYKQ8    Skin
    #> GTEX-1117F-0226-SM-5GZZ7        Nerve
    
    # One way to find samples of interest: filter the metadata rows by tissue
    # and take their row names, which are the sample IDs.
    # drop = FALSE keeps a data frame even if metadata had a single column.
    subset_adipose_samples <- metadata[metadata$TISSUE %in% "Adipose", , drop = FALSE]
    subset_adipose_samples
    #>                          SMATSSCR SMCENTER SMPTHNTS  TISSUE
    #> GTEX-1117F-0003-SM-58Q7G       NA       B1          Adipose
    #> GTEX-1117F-0003-SM-6WBT7       NA       B1          Adipose
    adipose_samples <- rownames(subset_adipose_samples)
    adipose_samples
    #> [1] "GTEX-1117F-0003-SM-58Q7G" "GTEX-1117F-0003-SM-6WBT7"

    subset_skin_samples <- metadata[metadata$TISSUE %in% "Skin", , drop = FALSE]
    subset_skin_samples
    #>                               SMATSSCR SMCENTER SMPTHNTS TISSUE
    #> GTEX-1117F-0003-SM-5DWSB            NA       B1            Skin
    #> GTEX-1117F-0011-R10b-SM-CYKQ8       NA   B1, A1            Skin
    skin_samples <- rownames(subset_skin_samples)
    skin_samples
    #> [1] "GTEX-1117F-0003-SM-5DWSB"      "GTEX-1117F-0011-R10b-SM-CYKQ8"

    # Select the matching sample columns from the expression table.
    # Indexing a data frame with a name that is not one of its columns stops
    # with "undefined columns selected", so first keep only the requested
    # samples that are present in tpm.df and report the ones that are not.
    samples_of_interest <- c(adipose_samples, skin_samples)
    samples_in_tpm <- intersect(samples_of_interest, colnames(tpm.df))
    missing_samples <- setdiff(samples_of_interest, colnames(tpm.df))
    if (length(missing_samples) > 0) {
      message(length(missing_samples), " requested sample(s) not found in tpm.df")
    }
    #> 4 requested sample(s) not found in tpm.df

    # drop = FALSE keeps a data frame when zero or one sample matches.
    # Add "Description" to the selection if the gene symbols should be kept.
    subset_tpm.df <- tpm.df[, samples_in_tpm, drop = FALSE]
    # The toy tpm.df holds none of the adipose/skin samples, so no columns
    # remain here; with the full data the matching sample columns are kept.
    dim(subset_tpm.df)
    #> [1] 6 0
    

    Created on 2022-07-19 by the reprex package (v2.0.1)

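    NB. With your sample dataset, selecting the columns by name directly (`tpm.df[c(adipose_samples, skin_samples)]`) fails with "undefined columns selected", because "tpm.df" contains only one sample column and it is neither an adipose nor a skin sample. Restricting the names to those that are present, e.g. with `intersect(c(adipose_samples, skin_samples), colnames(tpm.df))`, avoids the error, and the approach should work with your actual data.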