I have a data frame with a lot of RNA-seq counts (sample names as column names and genes as row names), and a file of metadata, i.e. sex, tissue type, disease status etc. (sample names as row names, and sex etc. as column names). I would like to create a subset of the RNA-seq counts data that contains just 2 of the tissue types, so that I can look at differential gene expression (DGE). Could someone suggest the best way to do this? I'm very new to working with RNA-seq data, so this may be obvious!
Thank you!
Edit: There are >1000 samples, so it would be impractical and error-prone to pick out the columns by typing their column names manually.
I hope this gives some insight into the counts data:
dput(head(tpm.df[1:2]))
structure(list(Description = c("DDX11L1", "WASH7P", "MIR6859-1",
"MIR1302-2HG", "FAM138A", "OR4G4P"), `GTEX-1117F-0226-SM-5GZZ7` = c(0L,
187L, 0L, 1L, 0L, 0L)), row.names = c("ENSG00000223972.5",
"ENSG00000227232.5",
"ENSG00000278267.1", "ENSG00000243485.5", "ENSG00000237613.2",
"ENSG00000268020.3"), class = "data.frame")
and this is the metadata:
structure(list(SMATSSCR = c(NA, NA, NA, NA, NA, 0L), SMCENTER = c("B1",
"B1", "B1", "B1, A1", "B1, A1", "B1"), SMPTHNTS = c("", "", "",
"", "", "2 pieces, ~15% vessel stroma, rep delineated")), row.names =
c("GTEX-1117F-0003-SM-58Q7G",
"GTEX-1117F-0003-SM-5DWSB", "GTEX-1117F-0003-SM-6WBT7",
"GTEX-1117F-0011-R10a-SM-AHZ7F",
"GTEX-1117F-0011-R10b-SM-CYKQ8", "GTEX-1117F-0226-SM-5GZZ7"), class =
"data.frame")
Do you have a "Tissue" column in your "metadata" dataframe? If so, you can use this to subset your "metadata" dataframe and then use that to subset your tpm values, e.g.
# Example counts table: one row per gene (gene IDs as row names), a
# "Description" column with the gene names, and one column of counts per sample.
# check.names = FALSE keeps the sample name as-is (it contains hyphens, which
# data.frame() would otherwise convert to dots).
tpm.df <- data.frame(
  Description = c(
    "DDX11L1", "WASH7P", "MIR6859-1",
    "MIR1302-2HG", "FAM138A", "OR4G4P"
  ),
  `GTEX-1117F-0226-SM-5GZZ7` = c(0L, 187L, 0L, 1L, 0L, 0L),
  row.names = c(
    "ENSG00000223972.5", "ENSG00000227232.5", "ENSG00000278267.1",
    "ENSG00000243485.5", "ENSG00000237613.2", "ENSG00000268020.3"
  ),
  check.names = FALSE,
  stringsAsFactors = FALSE
)
# Example sample metadata: one row per sample (sample names as row names).
# TISSUE holds the tissue type of each sample.
metadata <- data.frame(
  SMATSSCR = c(rep(NA_integer_, 5), 0L),
  SMCENTER = c("B1", "B1", "B1", "B1, A1", "B1, A1", "B1"),
  SMPTHNTS = c(rep("", 5), "2 pieces, ~15% vessel stroma, rep delineated"),
  TISSUE = c("Adipose", "Skin", "Adipose", "Muscle", "Skin", "Nerve"),
  row.names = c(
    "GTEX-1117F-0003-SM-58Q7G",
    "GTEX-1117F-0003-SM-5DWSB",
    "GTEX-1117F-0003-SM-6WBT7",
    "GTEX-1117F-0011-R10a-SM-AHZ7F",
    "GTEX-1117F-0011-R10b-SM-CYKQ8",
    "GTEX-1117F-0226-SM-5GZZ7"
  ),
  stringsAsFactors = FALSE
)
# Display the example counts table (genes in rows, samples in columns)
print(tpm.df)
#> Description GTEX-1117F-0226-SM-5GZZ7
#> ENSG00000223972.5 DDX11L1 0
#> ENSG00000227232.5 WASH7P 187
#> ENSG00000278267.1 MIR6859-1 0
#> ENSG00000243485.5 MIR1302-2HG 1
#> ENSG00000237613.2 FAM138A 0
#> ENSG00000268020.3 OR4G4P 0
# Display the example metadata (one row per sample, row names = sample names)
print(metadata)
#> SMATSSCR SMCENTER
#> GTEX-1117F-0003-SM-58Q7G NA B1
#> GTEX-1117F-0003-SM-5DWSB NA B1
#> GTEX-1117F-0003-SM-6WBT7 NA B1
#> GTEX-1117F-0011-R10a-SM-AHZ7F NA B1, A1
#> GTEX-1117F-0011-R10b-SM-CYKQ8 NA B1, A1
#> GTEX-1117F-0226-SM-5GZZ7 0 B1
#> SMPTHNTS
#> GTEX-1117F-0003-SM-58Q7G
#> GTEX-1117F-0003-SM-5DWSB
#> GTEX-1117F-0003-SM-6WBT7
#> GTEX-1117F-0011-R10a-SM-AHZ7F
#> GTEX-1117F-0011-R10b-SM-CYKQ8
#> GTEX-1117F-0226-SM-5GZZ7 2 pieces, ~15% vessel stroma, rep delineated
#> TISSUE
#> GTEX-1117F-0003-SM-58Q7G Adipose
#> GTEX-1117F-0003-SM-5DWSB Skin
#> GTEX-1117F-0003-SM-6WBT7 Adipose
#> GTEX-1117F-0011-R10a-SM-AHZ7F Muscle
#> GTEX-1117F-0011-R10b-SM-CYKQ8 Skin
#> GTEX-1117F-0226-SM-5GZZ7 Nerve
# One way to find samples of interest: filter the metadata rows by tissue,
# then take the row names (= sample names) of the matching rows
subset_adipose_samples <- metadata[metadata$TISSUE %in% c("Adipose"), ]
subset_adipose_samples
#> SMATSSCR SMCENTER SMPTHNTS TISSUE
#> GTEX-1117F-0003-SM-58Q7G NA B1 Adipose
#> GTEX-1117F-0003-SM-6WBT7 NA B1 Adipose
adipose_samples <- rownames(subset_adipose_samples)
adipose_samples
#> [1] "GTEX-1117F-0003-SM-58Q7G" "GTEX-1117F-0003-SM-6WBT7"
subset_skin_samples <- metadata[metadata$TISSUE %in% c("Skin"), ]
subset_skin_samples
#> SMATSSCR SMCENTER SMPTHNTS TISSUE
#> GTEX-1117F-0003-SM-5DWSB NA B1 Skin
#> GTEX-1117F-0011-R10b-SM-CYKQ8 NA B1, A1 Skin
skin_samples <- rownames(subset_skin_samples)
skin_samples
#> [1] "GTEX-1117F-0003-SM-5DWSB" "GTEX-1117F-0011-R10b-SM-CYKQ8"
# Keep only the samples that actually have a column in tpm.df. Selecting
# tpm.df[c(adipose_samples, skin_samples)] directly stops with
# "undefined columns selected" as soon as a single selected sample from the
# metadata has no column in the counts table.
samples_of_interest <- c(adipose_samples, skin_samples)
in_counts <- samples_of_interest %in% colnames(tpm.df)
if (!all(in_counts)) {
  # Report skipped samples rather than dropping them silently
  message(
    sum(!in_counts),
    " selected sample(s) have no column in tpm.df and were skipped"
  )
}
#> 4 selected sample(s) have no column in tpm.df and were skipped
# drop = FALSE keeps the result a data frame even if only one sample matches
subset_tpm.df <- tpm.df[, samples_of_interest[in_counts], drop = FALSE]
# With the example data none of the adipose/skin samples is a column of
# tpm.df, so the result has all 6 genes but 0 sample columns
dim(subset_tpm.df)
#> [1] 6 0
Created on 2022-07-19 by the reprex package (v2.0.1)
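NB. With your sample dataset, none of the adipose or skin samples has a column in "tpm.df" (besides "Description", it contains a single sample, "GTEX-1117F-0226-SM-5GZZ7", which is a Nerve sample), so selecting them directly with tpm.df[c(adipose_samples, skin_samples)] gives an "undefined columns selected" error. With your actual data this works as long as every selected sample name is also a column name of "tpm.df".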