I’m having trouble scraping data from https://ser-sid.org/, a database for seed traits. I've successfully retrieved a table of potential attributes of species along with their URLs using the following code:
library(jsonlite)
library(curl)
library(httr)
library(dplyr)
# Define the API endpoint URL
url <- "https://fyxheguykvewpdeysvoh.supabase.co/rest/v1/species_summary"
# Define the API key
api_key <- "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImZ5eGhlZ3V5a3BkZXlzdm9oIiwicm9sZSI6ImFub24iLCJpYXQiOjE2NDc0MTY1MzQsImV4cCI6MTk2Mjk5MjUzNH0.XhJKVijhMUidqeTbH62zQ6r8cS6j22TYAKfbbRHMTZ8"
# Define the species list
Species <- c("Astragalus glycyphyllos", "Epirrita dilutata", "Ilybius chalconatus",
"Myrmica hirsuta", "Oenanthe aquatica", "Pterapherapteryx sexalata",
"Rhantus frontalis", "Scorpidium cossonii", "Surnia ulula", "Taraxacum tortilobum"
)
restDF <- function(content) {
# Parse the JSON content
json_data <- jsonlite::fromJSON(content)
# Convert JSON to data frame
df <- as.data.frame(json_data)
# Return the data frame
return(df)
}
# Initialize a list to store results
results <- list()
# Loop through each species
for (i in 1:length(Species)) {
# Split species into genus and epithet
genus_epithet <- strsplit(Species[i], " ")[[1]]
genus <- genus_epithet[1]
epithet <- genus_epithet[2]
# Define the query parameters for each species
params <- list(
select = "*",
genus = paste0("ilike.", genus, "%"),
epithet = paste0("ilike.", epithet, "%"),
apikey = api_key # Include the API key as a query parameter
)
# Build the full URL with query parameters
full_url <- modify_url(url, query = params)
# Make the GET request using curl_fetch_memory()
response <- curl::curl_fetch_memory(full_url)
# Check if the request was successful
if (response$status_code == 200) {
# Parse the JSON response
json_content <- rawToChar(response$content)
# Convert JSON content to data frame using restDF function
df <- restDF(json_content)
# Append data to results list
results[[Species[i]]] <- df
} else {
print(paste("Error: Failed to retrieve data for", species))
}
}
Results <- results |>
purrr::reduce(bind_rows) |>
dplyr::mutate(URL = paste0("https://ser-sid.org/species/", id))
This produces the following data frame:
genus | epithet | id | has_germination | has_oil | has_protein | has_dispersal | has_seed_weights | has_storage_behaviour | has_morphology | URL |
---|---|---|---|---|---|---|---|---|---|---|
Astragalus | glycyphyllos | e7043715-6324-415e-83f8-02d282f7b5f8 | TRUE | TRUE | TRUE | FALSE | TRUE | TRUE | FALSE | https://ser-sid.org/species/e7043715-6324-415e-83f8-02d282f7b5f8 |
Oenanthe | aquatica | bb851d35-2b67-48d1-8b0b-bda2bbd05f42 | TRUE | FALSE | FALSE | FALSE | TRUE | TRUE | FALSE | https://ser-sid.org/species/bb851d35-2b67-48d1-8b0b-bda2bbd05f42 |
Now, I'm trying to get the detailed trait values from the URLs. For example, for seed weight, the data is styled with the CSS class .text-white. However, when I attempt to scrape this data using rvest, I get an empty string.
Here is the code I’m using
library(rvest)
URL <- Results$URL[1]
Data <- read_html(URL) |>
rvest::html_elements(".font-medium.text-white") |>
rvest::html_text()
# Or
Data <- read_html(URL) |>
rvest::html_elements(".tracking-tight, .font-medium.text-white") |>
rvest::html_text()
Both approaches return an empty string. I’ve tried various CSS selectors with html_elements but to no avail. Could anyone help me figure out what I'm doing wrong or suggest a better way to scrape the required data?
Thank you in advance for your help!
I tried to do it this way
# Function to parse JSON content to a data frame
restDF <- function(content) {
json_data <- jsonlite::fromJSON(content)
df <- as.data.frame(json_data)
return(df)
}
# Function to fetch species details
fetch_species_details <- function(species_id) {
# Define the query parameters for species details
params <- list(
select = "*,family(name),germination(percent_germ,presow_treatment,temperature,light_hours,days,medium,provenance,sample_size,reference_id),dispersal(notes,method,animal_group,animal_species,dispersal_agents),seed_weights(thousandseedweight,notes,material_weighed)",
# "https://fyxheguykvewpdeysvoh.supabase.co/rest/v1/species?select=*%2Cfamily%28name%29%2Cgermination%28percent_germ%2Cpresow_treatment%2Ctemperature%2Clight_hours%2Cdays%2Cmedium%2Cprovenance%2Csample_size%2Creference_id%29%2Cdispersal%28notes%2Cmethod%2Canimal_group%2Canimal_species%2Cdispersal_agents%28description%29%2Creference_id%29%2Cstorage_behaviour%28*%29%2Cseed_weights%28thousandseedweight%2Cnotes%2Cmaterial_weighed%28description%29%2Creference_id%29%2Cprotein_content%28protein_content%2Cmoisture_status%2Cnotes%2Cmaterial_weighed%28description%29%2Creference_id%29%2Coil_content%28oil_content%2Cmoisture_status%2Cnotes%2Cmaterial_weighed%28description%29%2Creference_id%29%2Cmorphology%28*%29&id=eq.e7043715-6324-415e-83f8-02d282f7b5f8"
id = paste0("eq.", species_id),
apikey = api_key
)
# Build the full URL with query parameters
full_url <- modify_url(details_url, query = params)
# Make the GET request using curl_fetch_memory()
response <- curl::curl_fetch_memory(full_url)
# Check if the request was successful
if (response$status_code == 200) {
# Parse the JSON response
json_content <- rawToChar(response$content)
# Convert JSON content to data frame using restDF function
df <- restDF(json_content)
return(df)
} else {
print(paste("Error: Failed to retrieve data for", species_id))
return(NULL)
}
}
# Initialize a list to store summary results
summary_results <- list()
# Fetch summary data for each species
for (species in Species) {
# Split species into genus and epithet
genus_epithet <- strsplit(species, " ")[[1]]
genus <- genus_epithet[1]
epithet <- genus_epithet[2]
# Define the query parameters for species summary
params <- list(
select = "*",
genus = paste0("ilike.", genus, "%"),
epithet = paste0("ilike.", epithet, "%"),
apikey = api_key
)
# Build the full URL with query parameters
full_url <- modify_url(summary_url, query = params)
# Make the GET request using curl_fetch_memory()
response <- curl::curl_fetch_memory(full_url)
# Check if the request was successful
if (response$status_code == 200) {
# Parse the JSON response
json_content <- rawToChar(response$content)
# Convert JSON content to data frame using restDF function
df <- restDF(json_content)
# Append data to summary results list
summary_results[[species]] <- df
} else {
print(paste("Error: Failed to retrieve data for", species))
}
}
# Combine summary results into a single data frame
summary_df <- summary_results |>
purrr::reduce(bind_rows) |>
dplyr::mutate(URL = paste0("https://ser-sid.org/species/", id))
# Initialize a list to store detailed results
detailed_results <- list()
# Fetch detailed data for each species
for (i in 1:nrow(summary_df)) {
species_id <- summary_df$id[i]
details_df <- fetch_species_details(species_id)
if (!is.null(details_df)) {
detailed_results[[species_id]] <- details_df
}
}
# Combine detailed results into a single data frame
detailed_df <- detailed_results |>
purrr::reduce(bind_rows)
# Print the detailed data frame
print(detailed_df)
and it mostly works, however 2 issues
Any help on finishing this last 2 details would be great
As the database is hosted at Supabase, you can check their API doc and PostgREST API. You probably still need to figure out table names (API endpoints, i.e. /species_summary
, /species
, /seed_weights
, etc) and relations from requests and/or minified JavaScript sources, but you can still build your own queries, use all PostgREST operators and don't really need to loop though through genus-epithet pairs or IDs. PostgREST also allows you to request results in flat CSV instead of JSON (well, kind of -- some fields in DB might be in JSON and when joining with to-many relationships, PostgREST also returns JSON objects; so one might still need to parse some pieces and handle nested data)
Regarding lists and nested frames, I'm afraid you need to come up with a requirement of your own, others can then help with implementation. Just flattening or unnesting everything to long and/or wide will likely cause your frame dimensions to explode. Perhaps split that retrieved dataset into multiple frames. Or adjust queries so you'd get data from a specific table.
As an example, getting summaries and seed details for a list of species with just 2 requests might look something like this:
library(httr2)
library(dplyr)
library(tidyr)
library(readr)
library(purrr)
library(stringr)
# httr2 request object to store root node URL and API key
postgrest_req <-
request("https://fyxheguykvewpdeysvoh.supabase.co/rest/v1/") |>
req_headers(apikey = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImZ5eGhlZ3V5a3Zld3BkZXlzdm9oIiwicm9sZSI6ImFub24iLCJpYXQiOjE2NDc0MTY1MzQsImV4cCI6MTk2Mjk5MjUzNH0.XhJKVijhMUidqeTbH62zQ6r8cS6j22TYAKfbbRHMTZ8")
# request helper, returns tibble
postgrest_qry <- function(tbl, ..., req = postgrest_req){
req |>
req_url_path_append(tbl) |>
req_url_query(...) |>
# request for CSV instead of JSON
req_headers(Accept = "text/csv") |>
req_perform() |>
resp_body_string() |>
read_csv(show_col_types = FALSE)
}
# test
# 3 rows from species_summary:
postgrest_qry("species_summary", limit = 3)
#> # A tibble: 3 × 12
#> genus epithet id infraspecies_rank infraspecies_epithet has_germination
#> <chr> <chr> <chr> <chr> <chr> <lgl>
#> 1 Abies sachalinen… f625… var. mayriana FALSE
#> 2 Senna pleurocarpa 08fe… <NA> <NA> FALSE
#> 3 Senna septemtrio… 457a… <NA> <NA> TRUE
#> # ℹ 6 more variables: has_oil <lgl>, has_protein <lgl>, has_dispersal <lgl>,
#> # has_seed_weights <lgl>, has_storage_behaviour <lgl>, has_morphology <lgl>
# WHERE genus ILIKE '...%'
postgrest_qry("species_summary", genus = "ilike.astra*", limit = 3)
#> # A tibble: 3 × 12
#> genus epithet id infraspecies_rank infraspecies_epithet has_germination
#> <chr> <chr> <chr> <chr> <chr> <lgl>
#> 1 Astragal… nervul… b06d… <NA> <NA> FALSE
#> 2 Astragal… steven… 10ec… subsp. meskheticus FALSE
#> 3 Astragal… subuli… f669… <NA> <NA> FALSE
#> # ℹ 6 more variables: has_oil <lgl>, has_protein <lgl>, has_dispersal <lgl>,
#> # has_seed_weights <lgl>, has_storage_behaviour <lgl>, has_morphology <lgl>
# WHERE genus IN (...)
postgrest_qry("species_summary", genus = "in.(Oenanthe,Scorpidium)", limit = 3)
#> # A tibble: 3 × 12
#> genus epithet id infraspecies_rank infraspecies_epithet has_germination
#> <chr> <chr> <chr> <lgl> <lgl> <lgl>
#> 1 Oenanthe aquatica bb85… NA NA TRUE
#> 2 Oenanthe crocata 4de9… NA NA TRUE
#> 3 Oenanthe fistulo… bd7c… NA NA TRUE
#> # ℹ 6 more variables: has_oil <lgl>, has_protein <lgl>, has_dispersal <lgl>,
#> # has_seed_weights <lgl>, has_storage_behaviour <lgl>, has_morphology <lgl>
# generated query will be a bit lenghty, so let's just use 3 spieces for an example
Species <- c("Astragalus glycyphyllos", "Epirrita dilutata", "Oenanthe aquatica")
# build a query to get summary for all genus-epithet ilike pairs
q_and <-
Species |>
str_split(" ") |>
map_chr(\(x) str_glue("and(genus.ilike.{x[1]}*,epithet.ilike.{x[2]}*)")) |>
str_c(collapse = ",")
q_or <- str_glue("({q_and})")
str_c("or=",q_or)
#> [1] "or=(and(genus.ilike.Astragalus*,epithet.ilike.glycyphyllos*),and(genus.ilike.Epirrita*,epithet.ilike.dilutata*),and(genus.ilike.Oenanthe*,epithet.ilike.aquatica*))"
(summary_df <- postgrest_qry("species_summary", or = q_or))
#> # A tibble: 2 × 12
#> genus epithet id infraspecies_rank infraspecies_epithet has_germination
#> <chr> <chr> <chr> <lgl> <lgl> <lgl>
#> 1 Oenanthe aquati… bb85… NA NA TRUE
#> 2 Astragal… glycyp… e704… NA NA TRUE
#> # ℹ 6 more variables: has_oil <lgl>, has_protein <lgl>, has_dispersal <lgl>,
#> # has_seed_weights <lgl>, has_storage_behaviour <lgl>, has_morphology <lgl>
# check species & seed_weights tables
postgrest_qry("species", limit = 3) |> glimpse()
#> Rows: 3
#> Columns: 16
#> $ id <chr> "f625934c-fa30-475a-9189-22463d04698e", "08fe8c…
#> $ int_id <dbl> 40, 21078, 21086
#> $ family <dbl> 4, 50, 50
#> $ genus <chr> "Abies", "Senna", "Senna"
#> $ epithet <chr> "sachalinensis", "pleurocarpa", "septemtrionali…
#> $ infraspecies_epithet <chr> "mayriana", NA, NA
#> $ authority <chr> "(F.Schmidt) Mast.", "(F.Muell.) Randell", "(Vi…
#> $ infraspecies_rank <chr> "var.", NA, NA
#> $ infraspecies_authority <chr> "Miyabe & Kudô", NA, NA
#> $ synonyms <chr> "Abies mayriana", "Cassia pleurocarpa F.V.M.", …
#> $ common_name <lgl> NA, NA, NA
#> $ lifeform <chr> "phan.", "nanophan.", "nanophan."
#> $ rank3 <lgl> NA, NA, NA
#> $ sp3 <lgl> NA, NA, NA
#> $ author3 <lgl> NA, NA, NA
#> $ binomial <chr> "'abi':1 'sachalinensi':2", "'pleurocarpa':2 's…
postgrest_qry("seed_weights", limit = 3) |> glimpse()
#> Rows: 3
#> Columns: 9
#> $ id <dbl> 60755, 60756, 60765
#> $ species_id <dbl> 13, 15, 20
#> $ thousandseedweight <dbl> 28.6, 34.5, 20.0
#> $ materialweighed_id <dbl> 1, 1, 1
#> $ notes <lgl> NA, NA, NA
#> $ reference_id <dbl> 1380, 1380, 1380
#> $ precision <dbl> 1, 1, 1
#> $ cultivar <lgl> NA, NA, NA
#> $ serialnum <lgl> NA, NA, NA
# it's probably safe to assume that seed_weights.species_id is a foregin key to species,
# so we can embed those resources / join tables
# get seed weights for all id-s in summary_df
(id_q <- str_c("in.(", str_c(summary_df$id, collapse = ","), ")"))
#> [1] "in.(bb851d35-2b67-48d1-8b0b-bda2bbd05f42,e7043715-6324-415e-83f8-02d282f7b5f8)"
# id,int_id,genus,epithet from species joined with seed_weights table
(seed_weights_df <- postgrest_qry("species", id = id_q, select = "id,int_id,genus,epithet,seed_weights(*)"))
#> # A tibble: 2 × 5
#> id int_id genus epithet seed_weights
#> <chr> <dbl> <chr> <chr> <chr>
#> 1 bb851d35-2b67-48d1-8b0b-bda2bbd05f42 16286 Oenanthe aquatica "[{\"id\":9…
#> 2 e7043715-6324-415e-83f8-02d282f7b5f8 2649 Astragalus glycyphyl… "[{\"id\":7…
# one-to-many realtionship, PostgREST handles this through JSON objects even though we have asked for CSV;
# parse JSON column (row-wise), unnest
seed_weights_df |>
rowwise() |>
mutate(w = jsonlite::parse_json(seed_weights, simplifyVector = TRUE) |> list(), .keep = "unused") |>
unnest(w, names_sep = ".")
#> # A tibble: 18 × 13
#> id int_id genus epithet w.id w.species_id w.thousandseedweight
#> <chr> <dbl> <chr> <chr> <int> <int> <dbl>
#> 1 bb851d35-2b67-… 16286 Oena… aquati… 90114 16286 2.19
#> 2 bb851d35-2b67-… 16286 Oena… aquati… 90159 16286 2.41
#> 3 bb851d35-2b67-… 16286 Oena… aquati… 139102 16286 2.16
#> 4 e7043715-6324-… 2649 Astr… glycyp… 79094 2649 4.5
#> 5 e7043715-6324-… 2649 Astr… glycyp… 79099 2649 7.17
#> 6 e7043715-6324-… 2649 Astr… glycyp… 85515 2649 4.22
#> 7 e7043715-6324-… 2649 Astr… glycyp… 85639 2649 4.05
#> 8 e7043715-6324-… 2649 Astr… glycyp… 87647 2649 6.05
#> 9 e7043715-6324-… 2649 Astr… glycyp… 88668 2649 5.07
#> 10 e7043715-6324-… 2649 Astr… glycyp… 88909 2649 5.23
#> 11 e7043715-6324-… 2649 Astr… glycyp… 92388 2649 3.50
#> 12 e7043715-6324-… 2649 Astr… glycyp… 104280 2649 3.00
#> 13 e7043715-6324-… 2649 Astr… glycyp… 105741 2649 3.53
#> 14 e7043715-6324-… 2649 Astr… glycyp… 142103 2649 5.86
#> 15 e7043715-6324-… 2649 Astr… glycyp… 62507 2649 4.5
#> 16 e7043715-6324-… 2649 Astr… glycyp… 62508 2649 4.54
#> 17 e7043715-6324-… 2649 Astr… glycyp… 77077 2649 4.87
#> 18 e7043715-6324-… 2649 Astr… glycyp… 84292 2649 4.66
#> # ℹ 6 more variables: w.materialweighed_id <int>, w.notes <chr>,
#> # w.reference_id <int>, w.precision <chr>, w.cultivar <lgl>,
#> # w.serialnum <chr>
Created on 2024-05-28 with reprex v2.1.0