Search code examples
htmlweb-scrapingbiomart

Scraping minor allele data from Ensembl - SNPs with more than one location?


The following coding (courtesy of E.Wiest) scrapes minor allele data from Ensembl grch38. How to improve it so that: (1) if an rs number is encountered with more than one location then data is extracted from all locations (at the moment minor allele data give NAs) and (2) the output gives three columns: "result.SNP", "result.MAF" and "result.source" and perhaps a fourth column "error" - how to add another column with chromosomal location e.g. 9:126646011. (3) not so important - but if a "HTTP error 500." occurs then even the result.SNP gives an error - so I have to check elsewhere which SNP it was ! Many thanks in advance.

library(textreadr)
library(rvest)
library(purrr)
library(dplyr)

### SNPs to look for
look=c("rs3762444", "rs284262", "rs655598", "rs12089815", "rs12140153", "rs788163", "rs1064213", "rs1106090", "rs7557796", "rs16825008")

### Function to get the data
biomart=function(x){
  z=read_html(paste0("http://www.ensembl.org/homo_sapiens/Variation/Explore?v=",x))
  a= z %>% 
    html_element(xpath = '//span[.="Highest population MAF"]/following-sibling::span/b') %>%
    html_text2()
  if (is.na(a)) {b<-NA_character_}else{
    b = z %>% 
    html_element(xpath = '//span[.="Highest population MAF"]/following-sibling::span') %>%
    html_attr("title") %>%
    read_html() %>% html_text2()}
  data.frame(SNP=x,MAF=a,source=b)
  }
### Map operation
out=map(look,safely(biomart),.progress = TRUE)
yyy <- out 
for (i in 1:length(yyy)) {
yyy[[i]]$error[sapply(yyy[[i]]$error, is.null)] <- NULL
yyy[[i]]$error <- as.vector(unlist(yyy[[i]]$error))
}
output=bind_rows(yyy)
output <- as.data.frame(output)
output


Solution

  • Updated code where multiple locations are supported, location column is added to the final dataframe, and errors are handled :

    ### Packages
    library(rvest)
    library(purrr)
    library(dplyr)
    
    ### SNPs to look for
    look=c("rs3762444", "rs284262", "rs655598", "rs12089815", "rs12140153", "rs788163", "rs1064213", "rs1106090", "rs7557796", "rs16825008")
    
    ### Function to get the data (with multiple locations or not)
    biomart=function(x){
      z=read_html(paste0("http://www.ensembl.org/homo_sapiens/Variation/Explore?v=",x))
      a= z %>% 
        html_element(xpath = '//span[.="Highest population MAF"]/following-sibling::span/b') %>%
        html_text2()
      if (is.na(a)) {ver= z %>%
        html_elements(xpath = "//select/option[position()>1]") %>%
        html_attr("value")
      urls = paste0("http://www.ensembl.org/Homo_sapiens/Variation/Explore?v=",x,"&vf=",ver)
      martview=function(t){
        q= read_html(t)
        a= q %>% 
          html_element(xpath = '//span[.="Highest population MAF"]/following-sibling::span/b') %>%
          html_text2()
        b = q %>% 
          html_element(xpath = '//span[.="Highest population MAF"]/following-sibling::span') %>%
          html_attr("title") %>%
          read_html() %>% html_text2()
        c= q %>% html_elements(xpath = '//span[@style="white-space:nowrap"]/a') %>% html_text2()
        data.frame(SNP=x,location = c,MAF=a,source=b)}
      temp=map(urls,martview)
      bind_rows(temp)}else{
        b = z %>% 
          html_element(xpath = '//span[.="Highest population MAF"]/following-sibling::span') %>%
          html_attr("title") %>%
          read_html() %>% html_text2()
        c = z %>% html_element(xpath = '//span[@style="white-space:nowrap"]/a') %>% 
          html_text2()
        data.frame(SNP=x,location = c,MAF=a,source=b)}
    }
    
    ### Map operation with purrr adverb which returns NAs if errors are thrown
    out=map(look,possibly(biomart,NA_character_),.progress = TRUE)
    
    ### Build the final dataframe
    output=bind_rows(out)
    

    Output :

              SNP                     location  MAF                       source
    1   rs3762444         Chromosome 1:2496273 0.50 T in 1000GENOMES:phase_3:IBS
    2   rs3762444 Scaffold HSCHR1_1_CTG3:47462 0.50 T in 1000GENOMES:phase_3:IBS
    3    rs284262        Chromosome 1:10683306 0.49 T in 1000GENOMES:phase_3:SAS
    4    rs655598       Chromosome 1:190318583 0.49 A in 1000GENOMES:phase_3:FIN
    5  rs12089815        Chromosome 1:90724376 0.50 G in 1000GENOMES:phase_3:EUR
    6  rs12140153        Chromosome 1:62114219 0.18             T in gnomADg:ami
    7    rs788163       Chromosome 2:172066831 0.42 C in 1000GENOMES:phase_3:LWK
    8   rs1064213       Chromosome 2:198085516 0.50             A in gnomADe:nfe
    9   rs1106090        Chromosome 2:57841606 0.50             A in gnomADg:amr
    10  rs7557796        Chromosome 2:86539030 0.48 T in 1000GENOMES:phase_3:SAS
    11 rs16825008       Chromosome 2:211440116 0.49 A in 1000GENOMES:phase_3:ITU