Search code examples
r geonames rna-seq

Extracting Gene Names from an RNA-seq Dataset in R


I have a problem I can't understand or solve. I downloaded GSE115262 from GEO: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE115262. I want to extract the gene names from GSM3172784HC$annotation.gene_name. When I do this, I get numbers instead of the gene names. How do I get the character values? If I run str(), this is what I get: $ annotation.gene_name : Factor w/ 56233 levels "5_8S_rRNA","5S_rRNA",..: 53514 52750 11836 48738. As you can see, I get numbers. If I run head() and look at GSM3172784HC$annotation.gene_name, I get the gene names, which is what I want. How do I get these?

 #### Need to load in all libraries
    #General Bioconductor packages
library("GEOquery");
library("Biobase");

   # Loop Through Files for download
# Call getGEOSuppFiles() once for every entry of tmp$V1.
# NOTE(review): `tmp` is not defined in this snippet; it is assumed to hold
# the GEO accession IDs in column V1 -- confirm against the code that builds it.
# seq_along() replaces 1:length(): when tmp$V1 is empty, 1:length() would
# iterate over c(1, 0) and call getGEOSuppFiles() with NA values.
for (i in seq_along(tmp$V1)) {
  getGEOSuppFiles(tmp$V1[i])
}

######## Healthy Controls GSE115262 ##########
## May need to read thing mult. times to get into R
# Read the gzipped table; the first line of the file supplies the column names.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
# NOTE: on R < 4.0.0 read.table() defaults to stringsAsFactors = TRUE, so text
# columns such as annotation.gene_name come back as factors.
GSM3172784HC <- read.table(gzfile("FilePath.txt.gz"), header = TRUE)

## New data-frame
# Build a one-column data frame of expected counts (column "HC1") whose row
# names are the gene names.
#
# as.character() is required: when annotation.gene_name is a factor, cbind()
# (and any other numeric coercion) uses the factor's integer codes, not its
# labels, which is why numbers appeared instead of gene names.
gene_names <- as.character(GSM3172784HC$annotation.gene_name)

# Row names must be unique, but gene names repeat in this table (56233
# distinct names for 57955 rows). make.unique() keeps the first occurrence
# as-is and appends ".1", ".2", ... to later duplicates.
HCData <- data.frame(
  HC1 = GSM3172784HC$expected_count,
  row.names = make.unique(gene_names)
)

str(GSM3172784HC)
'data.frame':   57955 obs. of  11 variables:
 $ X                      : int  1 2 3 4 5 6 7 8 9 10 ...
 $ annotation.gene_id     : Factor w/ 57955 levels "ENSG00000000003",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ annotation.gene_biotype: Factor w/ 43 levels "3prime_overlapping_ncRNA",..: 20 20 20 20 20 20 20 20 20 20 ...
 $ annotation.gene_name   : Factor w/ 56233 levels "5_8S_rRNA","5S_rRNA",..: 53514 52750 11836 48738 5916 13731 7375 14125 14433 24521 ...
 $ annotation.source      : Factor w/ 4 levels "ensembl","ensembl_havana",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ transcript_id.s.       : Factor w/ 57955 levels "ENST00000000233,ENST00000415666,ENST00000459680,ENST00000463733,ENST00000467281,ENST00000489673",..: 17666 17669 17397 16695 5799 17850 14301 7 1276 12553 ...
 $ length                 : num  1749 940 1073 1538 2430 ...
 $ effective_length       : num  1623 814 947 1412 2304 ...
 $ expected_count         : num  0 0 1 1 0 2 2 0 1 1 ...
 $ TPM                    : num  0 0 0.27 0.18 0 0.23 0.07 0 0.65 0.17 ...
 $ FPKM                   : num  0 0 0.41 0.27 0 0.35 0.11 0 0.98 0.25 ...

head(GSM3172784HC)
  X annotation.gene_id annotation.gene_biotype annotation.gene_name
1 1    ENSG00000000003          protein_coding               TSPAN6
2 2    ENSG00000000005          protein_coding                 TNMD
3 3    ENSG00000000419          protein_coding                 DPM1
4 4    ENSG00000000457          protein_coding                SCYL3
5 5    ENSG00000000460          protein_coding             C1orf112
6 6    ENSG00000000938          protein_coding                  FGR
  annotation.source
1    ensembl_havana
2    ensembl_havana
3    ensembl_havana
4    ensembl_havana
5    ensembl_havana
6    ensembl_havana
                                                                                                                                 transcript_id.s.
1                                                                 ENST00000373020,ENST00000494424,ENST00000496771,ENST00000612152,ENST00000614008
2                                                                                                                 ENST00000373031,ENST00000485971
3                                                 ENST00000371582,ENST00000371584,ENST00000371588,ENST00000413082,ENST00000466152,ENST00000494752
4                                                                 ENST00000367770,ENST00000367771,ENST00000367772,ENST00000423670,ENST00000470238
5 ENST00000286031,ENST00000359326,ENST00000413811,ENST00000459772,ENST00000466580,ENST00000472795,ENST00000481744,ENST00000496973,ENST00000498289
6                                 ENST00000374003,ENST00000374004,ENST00000374005,ENST00000399173,ENST00000457296,ENST00000468038,ENST00000475472
   length effective_length expected_count  TPM FPKM
1 1749.40          1623.17              0 0.00 0.00
2  940.50           814.28              0 0.00 0.00
3 1073.00           946.77              1 0.27 0.41
4 1538.00          1411.77              1 0.18 0.27
5 2430.11          2303.88              0 0.00 0.00
6 2350.00          2223.77              2 0.23 0.35

Solution

  • We can convert the column to character

    library(dplyr)
    # Turn every factor column into a character column; columns of any other
    # type are passed through unchanged.
    GSM3172784HC <- mutate_if(GSM3172784HC, is.factor, as.character)
    

    Or with mutate/across

    # Same conversion with across(): select the factor columns via
    # where(is.factor) and apply as.character to each of them.
    GSM3172784HC <- mutate(
      GSM3172784HC,
      across(where(is.factor), as.character)
    )
    

    In base R, we can do

    # Flag the factor columns. vapply() always returns a logical vector, even
    # for a data frame with zero columns, where sapply() would return an empty
    # list that cannot be used as a column index.
    i1 <- vapply(GSM3172784HC, is.factor, logical(1))
    # Replace only the flagged columns with their character labels.
    GSM3172784HC[i1] <- lapply(GSM3172784HC[i1], as.character)
    

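    NOTE: With R >= 4.0.0, stringsAsFactors = FALSE is the default, so read.table() returns these columns as character and no conversion is needed. On older versions, passing stringsAsFactors = FALSE to read.table() has the same effect.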