Trouble Scraping Dynamic Content from SER-SID.org with rvest in R

I’m having trouble scraping data from https://ser-sid.org/, a database for seed traits. I've successfully retrieved a table of potential attributes of species along with their URLs using the following code:

library(jsonlite)
library(curl)
library(httr)
library(dplyr)

# Define the API endpoint URL
url <- "https://fyxheguykvewpdeysvoh.supabase.co/rest/v1/species_summary"

# Define the API key
api_key <- "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImZ5eGhlZ3V5a3BkZXlzdm9oIiwicm9sZSI6ImFub24iLCJpYXQiOjE2NDc0MTY1MzQsImV4cCI6MTk2Mjk5MjUzNH0.XhJKVijhMUidqeTbH62zQ6r8cS6j22TYAKfbbRHMTZ8"

# Define the species list
Species <- c("Astragalus glycyphyllos", "Epirrita dilutata", "Ilybius chalconatus", 
"Myrmica hirsuta", "Oenanthe aquatica", "Pterapherapteryx sexalata", 
"Rhantus frontalis", "Scorpidium cossonii", "Surnia ulula", "Taraxacum tortilobum"
)

restDF <- function(content) { 
  # Parse the JSON content
  json_data <- jsonlite::fromJSON(content)
  
  # Convert JSON to data frame
  df <- as.data.frame(json_data)
  
  # Return the data frame
  return(df)
}

# Initialize a list to store results
results <- list()

# Loop through each species
for (i in 1:length(Species)) {
  # Split species into genus and epithet
  genus_epithet <- strsplit(Species[i], " ")[[1]]
  genus <- genus_epithet[1]
  epithet <- genus_epithet[2]
  
  # Define the query parameters for each species
  params <- list(
    select = "*",
    genus = paste0("ilike.", genus, "%"),
    epithet = paste0("ilike.", epithet, "%"),
    apikey = api_key  # Include the API key as a query parameter
  )
  
  # Build the full URL with query parameters
  full_url <- modify_url(url, query = params)
  
  # Make the GET request using curl_fetch_memory()
  response <- curl::curl_fetch_memory(full_url)
  
  # Check if the request was successful
  if (response$status_code == 200) {
    # Parse the JSON response
    json_content <- rawToChar(response$content)
    
    # Convert JSON content to data frame using restDF function
    df <- restDF(json_content)
    # Append data to results list
    results[[Species[i]]] <- df
  } else {
    print(paste("Error: Failed to retrieve data for", species))
  }
}

Results <- results |> 
  purrr::reduce(bind_rows) |> 
  dplyr::mutate(URL = paste0("https://ser-sid.org/species/", id))

This produces the following data frame:

genus	epithet	id	has_germination	has_oil	has_protein	has_dispersal	has_seed_weights	has_storage_behaviour	has_morphology	URL
Astragalus	glycyphyllos	e7043715-6324-415e-83f8-02d282f7b5f8	TRUE	TRUE	TRUE	FALSE	TRUE	TRUE	FALSE	https://ser-sid.org/species/e7043715-6324-415e-83f8-02d282f7b5f8
Oenanthe	aquatica	bb851d35-2b67-48d1-8b0b-bda2bbd05f42	TRUE	FALSE	FALSE	FALSE	TRUE	TRUE	FALSE	https://ser-sid.org/species/bb851d35-2b67-48d1-8b0b-bda2bbd05f42

Now, I'm trying to get the detailed trait values from the URLs. For example, for seed weight, the data is styled with the CSS class .text-white. However, when I attempt to scrape this data using rvest, I get an empty string.

Here is the code I’m using

library(rvest)

URL <- Results$URL[1]

Data <- read_html(URL) |> 
  rvest::html_elements(".font-medium.text-white") |> 
  rvest::html_text()

# Or

Data <- read_html(URL) |> 
  rvest::html_elements(".tracking-tight, .font-medium.text-white") |> 
  rvest::html_text()

Both approaches return an empty string. I’ve tried various CSS selectors with html_elements but to no avail. Could anyone help me figure out what I'm doing wrong or suggest a better way to scrape the required data?

Thank you in advance for your help!

Edit based on @margusl coment

I tried to do it this way

# Function to parse JSON content to a data frame
restDF <- function(content) { 
  json_data <- jsonlite::fromJSON(content)
  df <- as.data.frame(json_data)
  return(df)
}

# Function to fetch species details
fetch_species_details <- function(species_id) {
  # Define the query parameters for species details
  params <- list(
    select = "*,family(name),germination(percent_germ,presow_treatment,temperature,light_hours,days,medium,provenance,sample_size,reference_id),dispersal(notes,method,animal_group,animal_species,dispersal_agents),seed_weights(thousandseedweight,notes,material_weighed)",
#    "https://fyxheguykvewpdeysvoh.supabase.co/rest/v1/species?select=*%2Cfamily%28name%29%2Cgermination%28percent_germ%2Cpresow_treatment%2Ctemperature%2Clight_hours%2Cdays%2Cmedium%2Cprovenance%2Csample_size%2Creference_id%29%2Cdispersal%28notes%2Cmethod%2Canimal_group%2Canimal_species%2Cdispersal_agents%28description%29%2Creference_id%29%2Cstorage_behaviour%28*%29%2Cseed_weights%28thousandseedweight%2Cnotes%2Cmaterial_weighed%28description%29%2Creference_id%29%2Cprotein_content%28protein_content%2Cmoisture_status%2Cnotes%2Cmaterial_weighed%28description%29%2Creference_id%29%2Coil_content%28oil_content%2Cmoisture_status%2Cnotes%2Cmaterial_weighed%28description%29%2Creference_id%29%2Cmorphology%28*%29&id=eq.e7043715-6324-415e-83f8-02d282f7b5f8"
    id = paste0("eq.", species_id),
    apikey = api_key
  )
  
  # Build the full URL with query parameters
  full_url <- modify_url(details_url, query = params)
  
  # Make the GET request using curl_fetch_memory()
  response <- curl::curl_fetch_memory(full_url)
  
  # Check if the request was successful
  if (response$status_code == 200) {
    # Parse the JSON response
    json_content <- rawToChar(response$content)
    
    # Convert JSON content to data frame using restDF function
    df <- restDF(json_content)
    return(df)
  } else {
    print(paste("Error: Failed to retrieve data for", species_id))
    return(NULL)
  }
}

# Initialize a list to store summary results
summary_results <- list()

# Fetch summary data for each species
for (species in Species) {
  # Split species into genus and epithet
  genus_epithet <- strsplit(species, " ")[[1]]
  genus <- genus_epithet[1]
  epithet <- genus_epithet[2]
  
  # Define the query parameters for species summary
  params <- list(
    select = "*",
    genus = paste0("ilike.", genus, "%"),
    epithet = paste0("ilike.", epithet, "%"),
    apikey = api_key
  )
  
  # Build the full URL with query parameters
  full_url <- modify_url(summary_url, query = params)
  
  # Make the GET request using curl_fetch_memory()
  response <- curl::curl_fetch_memory(full_url)
  
  # Check if the request was successful
  if (response$status_code == 200) {
    # Parse the JSON response
    json_content <- rawToChar(response$content)
    
    # Convert JSON content to data frame using restDF function
    df <- restDF(json_content)
    # Append data to summary results list
    summary_results[[species]] <- df
  } else {
    print(paste("Error: Failed to retrieve data for", species))
  }
}

# Combine summary results into a single data frame
summary_df <- summary_results |> 
  purrr::reduce(bind_rows) |> 
  dplyr::mutate(URL = paste0("https://ser-sid.org/species/", id))

# Initialize a list to store detailed results
detailed_results <- list()

# Fetch detailed data for each species
for (i in 1:nrow(summary_df)) {
  species_id <- summary_df$id[i]
  details_df <- fetch_species_details(species_id)
  
  if (!is.null(details_df)) {
    detailed_results[[species_id]] <- details_df
  }
}

# Combine detailed results into a single data frame
detailed_df <- detailed_results |> 
  purrr::reduce(bind_rows)

# Print the detailed data frame
print(detailed_df)

and it mostly works, however 2 issues

Weight does not appear
Data frame has variables that are a list

Any help on finishing this last 2 details would be great

Solution

As the database is hosted at Supabase, you can check their API doc and PostgREST API. You probably still need to figure out table names (API endpoints, i.e. /species_summary , /species, /seed_weights, etc) and relations from requests and/or minified JavaScript sources, but you can still build your own queries, use all PostgREST operators and don't really need to loop though through genus-epithet pairs or IDs. PostgREST also allows you to request results in flat CSV instead of JSON (well, kind of -- some fields in DB might be in JSON and when joining with to-many relationships, PostgREST also returns JSON objects; so one might still need to parse some pieces and handle nested data)

Regarding lists and nested frames, I'm afraid you need to come up with a requirement of your own, others can then help with implementation. Just flattening or unnesting everything to long and/or wide will likely cause your frame dimensions to explode. Perhaps split that retrieved dataset into multiple frames. Or adjust queries so you'd get data from a specific table.

As an example, getting summaries and seed details for a list of species with just 2 requests might look something like this:

library(httr2)
library(dplyr)
library(tidyr)
library(readr)
library(purrr)
library(stringr)

# httr2 request object to store root node URL and API key
postgrest_req <- 
  request("https://fyxheguykvewpdeysvoh.supabase.co/rest/v1/") |>
  req_headers(apikey = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImZ5eGhlZ3V5a3Zld3BkZXlzdm9oIiwicm9sZSI6ImFub24iLCJpYXQiOjE2NDc0MTY1MzQsImV4cCI6MTk2Mjk5MjUzNH0.XhJKVijhMUidqeTbH62zQ6r8cS6j22TYAKfbbRHMTZ8")

# request helper, returns tibble
postgrest_qry <- function(tbl, ..., req = postgrest_req){
  req |>
    req_url_path_append(tbl) |>
    req_url_query(...) |>
    # request for CSV instead of JSON
    req_headers(Accept = "text/csv") |>
    req_perform() |>
    resp_body_string() |>
    read_csv(show_col_types = FALSE)
}
# test
# 3 rows from species_summary:
postgrest_qry("species_summary", limit = 3)
#> # A tibble: 3 × 12
#>   genus epithet     id    infraspecies_rank infraspecies_epithet has_germination
#>   <chr> <chr>       <chr> <chr>             <chr>                <lgl>          
#> 1 Abies sachalinen… f625… var.              mayriana             FALSE          
#> 2 Senna pleurocarpa 08fe… <NA>              <NA>                 FALSE          
#> 3 Senna septemtrio… 457a… <NA>              <NA>                 TRUE           
#> # ℹ 6 more variables: has_oil <lgl>, has_protein <lgl>, has_dispersal <lgl>,
#> #   has_seed_weights <lgl>, has_storage_behaviour <lgl>, has_morphology <lgl>

# WHERE genus ILIKE '...%'
postgrest_qry("species_summary", genus = "ilike.astra*", limit = 3)
#> # A tibble: 3 × 12
#>   genus     epithet id    infraspecies_rank infraspecies_epithet has_germination
#>   <chr>     <chr>   <chr> <chr>             <chr>                <lgl>          
#> 1 Astragal… nervul… b06d… <NA>              <NA>                 FALSE          
#> 2 Astragal… steven… 10ec… subsp.            meskheticus          FALSE          
#> 3 Astragal… subuli… f669… <NA>              <NA>                 FALSE          
#> # ℹ 6 more variables: has_oil <lgl>, has_protein <lgl>, has_dispersal <lgl>,
#> #   has_seed_weights <lgl>, has_storage_behaviour <lgl>, has_morphology <lgl>

# WHERE genus IN (...)
postgrest_qry("species_summary", genus = "in.(Oenanthe,Scorpidium)", limit = 3)
#> # A tibble: 3 × 12
#>   genus    epithet  id    infraspecies_rank infraspecies_epithet has_germination
#>   <chr>    <chr>    <chr> <lgl>             <lgl>                <lgl>          
#> 1 Oenanthe aquatica bb85… NA                NA                   TRUE           
#> 2 Oenanthe crocata  4de9… NA                NA                   TRUE           
#> 3 Oenanthe fistulo… bd7c… NA                NA                   TRUE           
#> # ℹ 6 more variables: has_oil <lgl>, has_protein <lgl>, has_dispersal <lgl>,
#> #   has_seed_weights <lgl>, has_storage_behaviour <lgl>, has_morphology <lgl>

# generated query will be a bit lenghty, so let's just use 3 spieces for an example
Species <- c("Astragalus glycyphyllos", "Epirrita dilutata", "Oenanthe aquatica")

# build a query to get summary for all genus-epithet ilike pairs
q_and <- 
  Species |>
  str_split(" ") |> 
  map_chr(\(x) str_glue("and(genus.ilike.{x[1]}*,epithet.ilike.{x[2]}*)")) |>
  str_c(collapse = ",") 
q_or <- str_glue("({q_and})")

str_c("or=",q_or)
#> [1] "or=(and(genus.ilike.Astragalus*,epithet.ilike.glycyphyllos*),and(genus.ilike.Epirrita*,epithet.ilike.dilutata*),and(genus.ilike.Oenanthe*,epithet.ilike.aquatica*))"
(summary_df <- postgrest_qry("species_summary", or = q_or))
#> # A tibble: 2 × 12
#>   genus     epithet id    infraspecies_rank infraspecies_epithet has_germination
#>   <chr>     <chr>   <chr> <lgl>             <lgl>                <lgl>          
#> 1 Oenanthe  aquati… bb85… NA                NA                   TRUE           
#> 2 Astragal… glycyp… e704… NA                NA                   TRUE           
#> # ℹ 6 more variables: has_oil <lgl>, has_protein <lgl>, has_dispersal <lgl>,
#> #   has_seed_weights <lgl>, has_storage_behaviour <lgl>, has_morphology <lgl>

# check species & seed_weights tables
postgrest_qry("species", limit = 3) |> glimpse()
#> Rows: 3
#> Columns: 16
#> $ id                     <chr> "f625934c-fa30-475a-9189-22463d04698e", "08fe8c…
#> $ int_id                 <dbl> 40, 21078, 21086
#> $ family                 <dbl> 4, 50, 50
#> $ genus                  <chr> "Abies", "Senna", "Senna"
#> $ epithet                <chr> "sachalinensis", "pleurocarpa", "septemtrionali…
#> $ infraspecies_epithet   <chr> "mayriana", NA, NA
#> $ authority              <chr> "(F.Schmidt) Mast.", "(F.Muell.) Randell", "(Vi…
#> $ infraspecies_rank      <chr> "var.", NA, NA
#> $ infraspecies_authority <chr> "Miyabe & Kudô", NA, NA
#> $ synonyms               <chr> "Abies mayriana", "Cassia pleurocarpa F.V.M.", …
#> $ common_name            <lgl> NA, NA, NA
#> $ lifeform               <chr> "phan.", "nanophan.", "nanophan."
#> $ rank3                  <lgl> NA, NA, NA
#> $ sp3                    <lgl> NA, NA, NA
#> $ author3                <lgl> NA, NA, NA
#> $ binomial               <chr> "'abi':1 'sachalinensi':2", "'pleurocarpa':2 's…
postgrest_qry("seed_weights", limit = 3) |> glimpse()
#> Rows: 3
#> Columns: 9
#> $ id                 <dbl> 60755, 60756, 60765
#> $ species_id         <dbl> 13, 15, 20
#> $ thousandseedweight <dbl> 28.6, 34.5, 20.0
#> $ materialweighed_id <dbl> 1, 1, 1
#> $ notes              <lgl> NA, NA, NA
#> $ reference_id       <dbl> 1380, 1380, 1380
#> $ precision          <dbl> 1, 1, 1
#> $ cultivar           <lgl> NA, NA, NA
#> $ serialnum          <lgl> NA, NA, NA

# it's probably safe to assume that seed_weights.species_id is a foregin key to species,
# so we can embed those resources / join tables

# get seed weights for all id-s in summary_df
(id_q <- str_c("in.(", str_c(summary_df$id, collapse = ","), ")"))
#> [1] "in.(bb851d35-2b67-48d1-8b0b-bda2bbd05f42,e7043715-6324-415e-83f8-02d282f7b5f8)"

# id,int_id,genus,epithet from species joined with seed_weights table
(seed_weights_df <- postgrest_qry("species", id = id_q, select = "id,int_id,genus,epithet,seed_weights(*)"))
#> # A tibble: 2 × 5
#>   id                                   int_id genus      epithet    seed_weights
#>   <chr>                                 <dbl> <chr>      <chr>      <chr>       
#> 1 bb851d35-2b67-48d1-8b0b-bda2bbd05f42  16286 Oenanthe   aquatica   "[{\"id\":9…
#> 2 e7043715-6324-415e-83f8-02d282f7b5f8   2649 Astragalus glycyphyl… "[{\"id\":7…

# one-to-many realtionship, PostgREST handles this through JSON objects even though we have asked for CSV;
# parse JSON column (row-wise), unnest
seed_weights_df |>
  rowwise() |>
  mutate(w = jsonlite::parse_json(seed_weights, simplifyVector = TRUE) |> list(), .keep = "unused") |>
  unnest(w, names_sep = ".")
#> # A tibble: 18 × 13
#>    id              int_id genus epithet   w.id w.species_id w.thousandseedweight
#>    <chr>            <dbl> <chr> <chr>    <int>        <int>                <dbl>
#>  1 bb851d35-2b67-…  16286 Oena… aquati…  90114        16286                 2.19
#>  2 bb851d35-2b67-…  16286 Oena… aquati…  90159        16286                 2.41
#>  3 bb851d35-2b67-…  16286 Oena… aquati… 139102        16286                 2.16
#>  4 e7043715-6324-…   2649 Astr… glycyp…  79094         2649                 4.5 
#>  5 e7043715-6324-…   2649 Astr… glycyp…  79099         2649                 7.17
#>  6 e7043715-6324-…   2649 Astr… glycyp…  85515         2649                 4.22
#>  7 e7043715-6324-…   2649 Astr… glycyp…  85639         2649                 4.05
#>  8 e7043715-6324-…   2649 Astr… glycyp…  87647         2649                 6.05
#>  9 e7043715-6324-…   2649 Astr… glycyp…  88668         2649                 5.07
#> 10 e7043715-6324-…   2649 Astr… glycyp…  88909         2649                 5.23
#> 11 e7043715-6324-…   2649 Astr… glycyp…  92388         2649                 3.50
#> 12 e7043715-6324-…   2649 Astr… glycyp… 104280         2649                 3.00
#> 13 e7043715-6324-…   2649 Astr… glycyp… 105741         2649                 3.53
#> 14 e7043715-6324-…   2649 Astr… glycyp… 142103         2649                 5.86
#> 15 e7043715-6324-…   2649 Astr… glycyp…  62507         2649                 4.5 
#> 16 e7043715-6324-…   2649 Astr… glycyp…  62508         2649                 4.54
#> 17 e7043715-6324-…   2649 Astr… glycyp…  77077         2649                 4.87
#> 18 e7043715-6324-…   2649 Astr… glycyp…  84292         2649                 4.66
#> # ℹ 6 more variables: w.materialweighed_id <int>, w.notes <chr>,
#> #   w.reference_id <int>, w.precision <chr>, w.cultivar <lgl>,
#> #   w.serialnum <chr>

^{Created on 2024-05-28 with reprex v2.1.0}