The following coding (courtesy of E.Wiest) scrapes minor allele data from Ensembl grch38. How to improve it so that: (1) if an rs number is encountered with more than one location then data is extracted from all locations (at the moment minor allele data give NAs) and (2) the output gives three columns: "result.SNP", "result.MAF" and "result.source" and perhaps a fourth column "error" - how to add another column with chromosomal location e.g. 9:126646011. (3) not so important - but if a "HTTP error 500." occurs then even the result.SNP gives an error - so I have to check elsewhere which SNP it was ! Many thanks in advance.
library(textreadr)
library(rvest)
library(purrr)
library(dplyr)
### SNPs to look for
look=c("rs3762444", "rs284262", "rs655598", "rs12089815", "rs12140153", "rs788163", "rs1064213", "rs1106090", "rs7557796", "rs16825008")
### Function to get the data
biomart=function(x){
z=read_html(paste0("http://www.ensembl.org/homo_sapiens/Variation/Explore?v=",x))
a= z %>%
html_element(xpath = '//span[.="Highest population MAF"]/following-sibling::span/b') %>%
html_text2()
if (is.na(a)) {b<-NA_character_}else{
b = z %>%
html_element(xpath = '//span[.="Highest population MAF"]/following-sibling::span') %>%
html_attr("title") %>%
read_html() %>% html_text2()}
data.frame(SNP=x,MAF=a,source=b)
}
### Map operation
out=map(look,safely(biomart),.progress = TRUE)
yyy <- out
for (i in 1:length(yyy)) {
yyy[[i]]$error[sapply(yyy[[i]]$error, is.null)] <- NULL
yyy[[i]]$error <- as.vector(unlist(yyy[[i]]$error))
}
output=bind_rows(yyy)
output <- as.data.frame(output)
output
Updated code where multiple locations are supported, location column is added to the final dataframe, and errors are handled :
### Packages
library(rvest)
library(purrr)
library(dplyr)
### SNPs to look for
look=c("rs3762444", "rs284262", "rs655598", "rs12089815", "rs12140153", "rs788163", "rs1064213", "rs1106090", "rs7557796", "rs16825008")
### Function to get the data (with multiple locations or not)
biomart=function(x){
z=read_html(paste0("http://www.ensembl.org/homo_sapiens/Variation/Explore?v=",x))
a= z %>%
html_element(xpath = '//span[.="Highest population MAF"]/following-sibling::span/b') %>%
html_text2()
if (is.na(a)) {ver= z %>%
html_elements(xpath = "//select/option[position()>1]") %>%
html_attr("value")
urls = paste0("http://www.ensembl.org/Homo_sapiens/Variation/Explore?v=",x,"&vf=",ver)
martview=function(t){
q= read_html(t)
a= q %>%
html_element(xpath = '//span[.="Highest population MAF"]/following-sibling::span/b') %>%
html_text2()
b = q %>%
html_element(xpath = '//span[.="Highest population MAF"]/following-sibling::span') %>%
html_attr("title") %>%
read_html() %>% html_text2()
c= q %>% html_elements(xpath = '//span[@style="white-space:nowrap"]/a') %>% html_text2()
data.frame(SNP=x,location = c,MAF=a,source=b)}
temp=map(urls,martview)
bind_rows(temp)}else{
b = z %>%
html_element(xpath = '//span[.="Highest population MAF"]/following-sibling::span') %>%
html_attr("title") %>%
read_html() %>% html_text2()
c = z %>% html_element(xpath = '//span[@style="white-space:nowrap"]/a') %>%
html_text2()
data.frame(SNP=x,location = c,MAF=a,source=b)}
}
### Map operation with purrr adverb which returns NAs if errors are thrown
out=map(look,possibly(biomart,NA_character_),.progress = TRUE)
### Build the final dataframe
output=bind_rows(out)
Output :
SNP location MAF source
1 rs3762444 Chromosome 1:2496273 0.50 T in 1000GENOMES:phase_3:IBS
2 rs3762444 Scaffold HSCHR1_1_CTG3:47462 0.50 T in 1000GENOMES:phase_3:IBS
3 rs284262 Chromosome 1:10683306 0.49 T in 1000GENOMES:phase_3:SAS
4 rs655598 Chromosome 1:190318583 0.49 A in 1000GENOMES:phase_3:FIN
5 rs12089815 Chromosome 1:90724376 0.50 G in 1000GENOMES:phase_3:EUR
6 rs12140153 Chromosome 1:62114219 0.18 T in gnomADg:ami
7 rs788163 Chromosome 2:172066831 0.42 C in 1000GENOMES:phase_3:LWK
8 rs1064213 Chromosome 2:198085516 0.50 A in gnomADe:nfe
9 rs1106090 Chromosome 2:57841606 0.50 A in gnomADg:amr
10 rs7557796 Chromosome 2:86539030 0.48 T in 1000GENOMES:phase_3:SAS
11 rs16825008 Chromosome 2:211440116 0.49 A in 1000GENOMES:phase_3:ITU