I have a large dataset of gene expression data and I'm trying to convert the gene identifiers into gene names using biomaRt in RStudio, but for some reason when I use the merge function on my data frames, my entire data table is merged wrong/erased. I've looked at the previous questions here, but no matter what I try, my code doesn't seem to work properly. Thank you infinitely!
library(biomaRt)
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "genes"
head(resdata)
## Write results
resdata <- resdata[complete.cases(resdata), ]
dim(resdata)
The problems start here:
#to convert gene accession number to gene name
charg <- resdata$genes
head(charg)
charg2 = sapply(strsplit(charg, '.', fixed=T), function(x) x[1])
ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")
theBM = getBM(attributes='hgnc_symbol',
filters = 'ensembl_gene_id',
values = charg2,
mart = ensembl)
resdata <- merge.data.frame(resdata, theBM, by.x="genes",by.y="hgnc_symbol")
# a <- c(resdata[3])
# counts_resdata <-counts[resdata$ensembl_gene_id,]
# row.names(counts_resdata) <- resdata[,"V1"]
# cal_z_score <- function(x){
# (x - mean(x)) / sd(x)
# }
write.csv(resdata, file="diffexprresultsHEK.csv")
dev.off()
> dput(head(resdata))
structure(list(genes = structure(c("ENSG00000261150.2", "ENSG00000164877.18",
"ENSG00000120334.15", "ENSG00000100906.10", "ENSG00000182759.3",
"ENSG00000124145.6"), class = "AsIs"), baseMean = c(4093.85581350533,
2362.58393155573, 3727.90538524843, 6269.83601940967, 1514.2066991352,
4802.56186913745), log2FoldChange = c(-7.91660950515258, -5.26346217291626,
3.32325541003148, 2.95482654632078, -5.67082078657074, 2.79396304109662
), lfcSE = c(0.192088463317979, 0.149333035266368, 0.105355230912976,
0.097569264524605, 0.194208068005162, 0.0965853229316347), stat = c(-41.2133522670104,
-35.2464688307429, 31.5433356391815, 30.2843990955331, -29.1997178326289,
28.9274079776516), pvalue = c(0, 3.88608699685236e-272, 2.21307385030673e-218,
1.83983881587879e-201, 1.95527687476496e-187, 5.40010609376884e-184
), padj = c(0, 3.9601169541424e-268, 1.50348860477005e-214, 9.3744387266064e-198,
7.97009959691694e-184, 1.83432603828505e-180), `HEK-FUS1-1.counts` = c(8260.9703617894,
5075.51515177084, 665.085490083024, 1513.61286043731, 3440.18729968435,
1262.3583419615), `HEK-FUS1-2.counts` = c(8046.96326903085, 4134.79795973702,
690.697680591815, 1346.52518701783, 2499.92325557892, 1154.73922910593
), `HEK-H149A-1.counts` = c(34.3284200812733, 113.825813953696,
6450.12945737609, 10806.2252897945, 60.5264248801398, 8302.96076228903
), `HEK-H149A-2.counts` = c(33.1612031197744, 126.196800761364,
7105.70891294277, 11412.980740389, 56.1898163973955, 8490.18914319335
)), row.names = c(NA, 6L), class = "data.frame")
Here's some output (where I'm struggling):
> head(charg)
[1] "ENSG00000261150.2" "ENSG00000164877.18" "ENSG00000120334.15"
[4] "ENSG00000100906.10" "ENSG00000182759.3" "ENSG00000124145.6"
> dim(theBM)
[1] 0 1
> head(theBM)
[1] ensembl_gene_id
<0 rows> (or 0-length row.names)
> dim(resdata)
[1] 20381 11
> resdata <- merge.data.frame(resdata, theBM, by.x="genes",by.y="ensembl_gene_id")
> dim(resdata) #after merge
[1] 0 11 #isn't correct -- just row names! where'd my genes go?
Edit: Problems solved! Turns out I was referencing getBM
wrong. Thank you all!
If you want to just overwrite the Ensemble IDs with the HGNC IDs you can do it in one step:
library(biomaRt)
names(resdata)[1] <- "genes"
head(resdata)
## Write results
resdata <- resdata[complete.cases(resdata), ]
dim(resdata)
charg <- resdata$genes
head(charg)
charg2 = sapply(strsplit(charg, '.', fixed=T), function(x) x[1])
ensembl = useMart(biomart = "ensembl", dataset="hsapiens_gene_ensembl")
resdata[1] = getBM(attributes='hgnc_symbol',
filters = 'ensembl_gene_id',
values = charg2,
mart = ensembl)
resdata
(This keeps Log2FC as column 3, which looks right based on the next steps in your pipeline, but if you want something different let me know and I'll update my answer to suit)