Search code examples
rweb-scrapingrvest

Trouble Scraping Dynamic Content from SER-SID.org with rvest in R


I’m having trouble scraping data from https://ser-sid.org/, a database for seed traits. I've successfully retrieved a table of potential attributes of species along with their URLs using the following code:

library(jsonlite)
library(curl)
library(httr)
library(dplyr)

# Define the API endpoint URL
url <- "https://fyxheguykvewpdeysvoh.supabase.co/rest/v1/species_summary"

# Define the API key
api_key <- "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImZ5eGhlZ3V5a3BkZXlzdm9oIiwicm9sZSI6ImFub24iLCJpYXQiOjE2NDc0MTY1MzQsImV4cCI6MTk2Mjk5MjUzNH0.XhJKVijhMUidqeTbH62zQ6r8cS6j22TYAKfbbRHMTZ8"

# Define the species list
Species <- c("Astragalus glycyphyllos", "Epirrita dilutata", "Ilybius chalconatus", 
"Myrmica hirsuta", "Oenanthe aquatica", "Pterapherapteryx sexalata", 
"Rhantus frontalis", "Scorpidium cossonii", "Surnia ulula", "Taraxacum tortilobum"
)

restDF <- function(content) { 
  # Parse the JSON content
  json_data <- jsonlite::fromJSON(content)
  
  # Convert JSON to data frame
  df <- as.data.frame(json_data)
  
  # Return the data frame
  return(df)
}

# Initialize a list to store results
results <- list()

# Loop through each species
for (i in 1:length(Species)) {
  # Split species into genus and epithet
  genus_epithet <- strsplit(Species[i], " ")[[1]]
  genus <- genus_epithet[1]
  epithet <- genus_epithet[2]
  
  # Define the query parameters for each species
  params <- list(
    select = "*",
    genus = paste0("ilike.", genus, "%"),
    epithet = paste0("ilike.", epithet, "%"),
    apikey = api_key  # Include the API key as a query parameter
  )
  
  # Build the full URL with query parameters
  full_url <- modify_url(url, query = params)
  
  # Make the GET request using curl_fetch_memory()
  response <- curl::curl_fetch_memory(full_url)
  
  # Check if the request was successful
  if (response$status_code == 200) {
    # Parse the JSON response
    json_content <- rawToChar(response$content)
    
    # Convert JSON content to data frame using restDF function
    df <- restDF(json_content)
    # Append data to results list
    results[[Species[i]]] <- df
  } else {
    print(paste("Error: Failed to retrieve data for", species))
  }
}

Results <- results |> 
  purrr::reduce(bind_rows) |> 
  dplyr::mutate(URL = paste0("https://ser-sid.org/species/", id))

This produces the following data frame:

genus epithet id has_germination has_oil has_protein has_dispersal has_seed_weights has_storage_behaviour has_morphology URL
Astragalus glycyphyllos e7043715-6324-415e-83f8-02d282f7b5f8 TRUE TRUE TRUE FALSE TRUE TRUE FALSE https://ser-sid.org/species/e7043715-6324-415e-83f8-02d282f7b5f8
Oenanthe aquatica bb851d35-2b67-48d1-8b0b-bda2bbd05f42 TRUE FALSE FALSE FALSE TRUE TRUE FALSE https://ser-sid.org/species/bb851d35-2b67-48d1-8b0b-bda2bbd05f42

Now, I'm trying to get the detailed trait values from the URLs. For example, for seed weight, the data is styled with the CSS class .text-white. However, when I attempt to scrape this data using rvest, I get an empty string.

Here is the code I’m using

library(rvest)

URL <- Results$URL[1]

Data <- read_html(URL) |> 
  rvest::html_elements(".font-medium.text-white") |> 
  rvest::html_text()

# Or

Data <- read_html(URL) |> 
  rvest::html_elements(".tracking-tight, .font-medium.text-white") |> 
  rvest::html_text()

Both approaches return an empty string. I’ve tried various CSS selectors with html_elements but to no avail. Could anyone help me figure out what I'm doing wrong or suggest a better way to scrape the required data?

Thank you in advance for your help!

Edit based on @margusl coment

I tried to do it this way

# Function to parse JSON content to a data frame
restDF <- function(content) { 
  json_data <- jsonlite::fromJSON(content)
  df <- as.data.frame(json_data)
  return(df)
}

# Function to fetch species details
fetch_species_details <- function(species_id) {
  # Define the query parameters for species details
  params <- list(
    select = "*,family(name),germination(percent_germ,presow_treatment,temperature,light_hours,days,medium,provenance,sample_size,reference_id),dispersal(notes,method,animal_group,animal_species,dispersal_agents),seed_weights(thousandseedweight,notes,material_weighed)",
#    "https://fyxheguykvewpdeysvoh.supabase.co/rest/v1/species?select=*%2Cfamily%28name%29%2Cgermination%28percent_germ%2Cpresow_treatment%2Ctemperature%2Clight_hours%2Cdays%2Cmedium%2Cprovenance%2Csample_size%2Creference_id%29%2Cdispersal%28notes%2Cmethod%2Canimal_group%2Canimal_species%2Cdispersal_agents%28description%29%2Creference_id%29%2Cstorage_behaviour%28*%29%2Cseed_weights%28thousandseedweight%2Cnotes%2Cmaterial_weighed%28description%29%2Creference_id%29%2Cprotein_content%28protein_content%2Cmoisture_status%2Cnotes%2Cmaterial_weighed%28description%29%2Creference_id%29%2Coil_content%28oil_content%2Cmoisture_status%2Cnotes%2Cmaterial_weighed%28description%29%2Creference_id%29%2Cmorphology%28*%29&id=eq.e7043715-6324-415e-83f8-02d282f7b5f8"
    id = paste0("eq.", species_id),
    apikey = api_key
  )
  
  # Build the full URL with query parameters
  full_url <- modify_url(details_url, query = params)
  
  # Make the GET request using curl_fetch_memory()
  response <- curl::curl_fetch_memory(full_url)
  
  # Check if the request was successful
  if (response$status_code == 200) {
    # Parse the JSON response
    json_content <- rawToChar(response$content)
    
    # Convert JSON content to data frame using restDF function
    df <- restDF(json_content)
    return(df)
  } else {
    print(paste("Error: Failed to retrieve data for", species_id))
    return(NULL)
  }
}

# Initialize a list to store summary results
summary_results <- list()

# Fetch summary data for each species
for (species in Species) {
  # Split species into genus and epithet
  genus_epithet <- strsplit(species, " ")[[1]]
  genus <- genus_epithet[1]
  epithet <- genus_epithet[2]
  
  # Define the query parameters for species summary
  params <- list(
    select = "*",
    genus = paste0("ilike.", genus, "%"),
    epithet = paste0("ilike.", epithet, "%"),
    apikey = api_key
  )
  
  # Build the full URL with query parameters
  full_url <- modify_url(summary_url, query = params)
  
  # Make the GET request using curl_fetch_memory()
  response <- curl::curl_fetch_memory(full_url)
  
  # Check if the request was successful
  if (response$status_code == 200) {
    # Parse the JSON response
    json_content <- rawToChar(response$content)
    
    # Convert JSON content to data frame using restDF function
    df <- restDF(json_content)
    # Append data to summary results list
    summary_results[[species]] <- df
  } else {
    print(paste("Error: Failed to retrieve data for", species))
  }
}

# Combine summary results into a single data frame
summary_df <- summary_results |> 
  purrr::reduce(bind_rows) |> 
  dplyr::mutate(URL = paste0("https://ser-sid.org/species/", id))

# Initialize a list to store detailed results
detailed_results <- list()

# Fetch detailed data for each species
for (i in 1:nrow(summary_df)) {
  species_id <- summary_df$id[i]
  details_df <- fetch_species_details(species_id)
  
  if (!is.null(details_df)) {
    detailed_results[[species_id]] <- details_df
  }
}

# Combine detailed results into a single data frame
detailed_df <- detailed_results |> 
  purrr::reduce(bind_rows)

# Print the detailed data frame
print(detailed_df)

and it mostly works, however 2 issues

  1. Weight does not appear
  2. Data frame has variables that are a list

Any help on finishing this last 2 details would be great


Solution

  • As the database is hosted at Supabase, you can check their API doc and PostgREST API. You probably still need to figure out table names (API endpoints, i.e. /species_summary , /species, /seed_weights, etc) and relations from requests and/or minified JavaScript sources, but you can still build your own queries, use all PostgREST operators and don't really need to loop though through genus-epithet pairs or IDs. PostgREST also allows you to request results in flat CSV instead of JSON (well, kind of -- some fields in DB might be in JSON and when joining with to-many relationships, PostgREST also returns JSON objects; so one might still need to parse some pieces and handle nested data)

    Regarding lists and nested frames, I'm afraid you need to come up with a requirement of your own, others can then help with implementation. Just flattening or unnesting everything to long and/or wide will likely cause your frame dimensions to explode. Perhaps split that retrieved dataset into multiple frames. Or adjust queries so you'd get data from a specific table.

    As an example, getting summaries and seed details for a list of species with just 2 requests might look something like this:

    library(httr2)
    library(dplyr)
    library(tidyr)
    library(readr)
    library(purrr)
    library(stringr)
    
    # httr2 request object to store root node URL and API key
    postgrest_req <- 
      request("https://fyxheguykvewpdeysvoh.supabase.co/rest/v1/") |>
      req_headers(apikey = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImZ5eGhlZ3V5a3Zld3BkZXlzdm9oIiwicm9sZSI6ImFub24iLCJpYXQiOjE2NDc0MTY1MzQsImV4cCI6MTk2Mjk5MjUzNH0.XhJKVijhMUidqeTbH62zQ6r8cS6j22TYAKfbbRHMTZ8")
    
    # request helper, returns tibble
    postgrest_qry <- function(tbl, ..., req = postgrest_req){
      req |>
        req_url_path_append(tbl) |>
        req_url_query(...) |>
        # request for CSV instead of JSON
        req_headers(Accept = "text/csv") |>
        req_perform() |>
        resp_body_string() |>
        read_csv(show_col_types = FALSE)
    }
    # test
    # 3 rows from species_summary:
    postgrest_qry("species_summary", limit = 3)
    #> # A tibble: 3 × 12
    #>   genus epithet     id    infraspecies_rank infraspecies_epithet has_germination
    #>   <chr> <chr>       <chr> <chr>             <chr>                <lgl>          
    #> 1 Abies sachalinen… f625… var.              mayriana             FALSE          
    #> 2 Senna pleurocarpa 08fe… <NA>              <NA>                 FALSE          
    #> 3 Senna septemtrio… 457a… <NA>              <NA>                 TRUE           
    #> # ℹ 6 more variables: has_oil <lgl>, has_protein <lgl>, has_dispersal <lgl>,
    #> #   has_seed_weights <lgl>, has_storage_behaviour <lgl>, has_morphology <lgl>
    
    # WHERE genus ILIKE '...%'
    postgrest_qry("species_summary", genus = "ilike.astra*", limit = 3)
    #> # A tibble: 3 × 12
    #>   genus     epithet id    infraspecies_rank infraspecies_epithet has_germination
    #>   <chr>     <chr>   <chr> <chr>             <chr>                <lgl>          
    #> 1 Astragal… nervul… b06d… <NA>              <NA>                 FALSE          
    #> 2 Astragal… steven… 10ec… subsp.            meskheticus          FALSE          
    #> 3 Astragal… subuli… f669… <NA>              <NA>                 FALSE          
    #> # ℹ 6 more variables: has_oil <lgl>, has_protein <lgl>, has_dispersal <lgl>,
    #> #   has_seed_weights <lgl>, has_storage_behaviour <lgl>, has_morphology <lgl>
    
    # WHERE genus IN (...)
    postgrest_qry("species_summary", genus = "in.(Oenanthe,Scorpidium)", limit = 3)
    #> # A tibble: 3 × 12
    #>   genus    epithet  id    infraspecies_rank infraspecies_epithet has_germination
    #>   <chr>    <chr>    <chr> <lgl>             <lgl>                <lgl>          
    #> 1 Oenanthe aquatica bb85… NA                NA                   TRUE           
    #> 2 Oenanthe crocata  4de9… NA                NA                   TRUE           
    #> 3 Oenanthe fistulo… bd7c… NA                NA                   TRUE           
    #> # ℹ 6 more variables: has_oil <lgl>, has_protein <lgl>, has_dispersal <lgl>,
    #> #   has_seed_weights <lgl>, has_storage_behaviour <lgl>, has_morphology <lgl>
    
    # generated query will be a bit lenghty, so let's just use 3 spieces for an example
    Species <- c("Astragalus glycyphyllos", "Epirrita dilutata", "Oenanthe aquatica")
    
    # build a query to get summary for all genus-epithet ilike pairs
    q_and <- 
      Species |>
      str_split(" ") |> 
      map_chr(\(x) str_glue("and(genus.ilike.{x[1]}*,epithet.ilike.{x[2]}*)")) |>
      str_c(collapse = ",") 
    q_or <- str_glue("({q_and})")
    
    str_c("or=",q_or)
    #> [1] "or=(and(genus.ilike.Astragalus*,epithet.ilike.glycyphyllos*),and(genus.ilike.Epirrita*,epithet.ilike.dilutata*),and(genus.ilike.Oenanthe*,epithet.ilike.aquatica*))"
    (summary_df <- postgrest_qry("species_summary", or = q_or))
    #> # A tibble: 2 × 12
    #>   genus     epithet id    infraspecies_rank infraspecies_epithet has_germination
    #>   <chr>     <chr>   <chr> <lgl>             <lgl>                <lgl>          
    #> 1 Oenanthe  aquati… bb85… NA                NA                   TRUE           
    #> 2 Astragal… glycyp… e704… NA                NA                   TRUE           
    #> # ℹ 6 more variables: has_oil <lgl>, has_protein <lgl>, has_dispersal <lgl>,
    #> #   has_seed_weights <lgl>, has_storage_behaviour <lgl>, has_morphology <lgl>
    
    # check species & seed_weights tables
    postgrest_qry("species", limit = 3) |> glimpse()
    #> Rows: 3
    #> Columns: 16
    #> $ id                     <chr> "f625934c-fa30-475a-9189-22463d04698e", "08fe8c…
    #> $ int_id                 <dbl> 40, 21078, 21086
    #> $ family                 <dbl> 4, 50, 50
    #> $ genus                  <chr> "Abies", "Senna", "Senna"
    #> $ epithet                <chr> "sachalinensis", "pleurocarpa", "septemtrionali…
    #> $ infraspecies_epithet   <chr> "mayriana", NA, NA
    #> $ authority              <chr> "(F.Schmidt) Mast.", "(F.Muell.) Randell", "(Vi…
    #> $ infraspecies_rank      <chr> "var.", NA, NA
    #> $ infraspecies_authority <chr> "Miyabe & Kudô", NA, NA
    #> $ synonyms               <chr> "Abies mayriana", "Cassia pleurocarpa F.V.M.", …
    #> $ common_name            <lgl> NA, NA, NA
    #> $ lifeform               <chr> "phan.", "nanophan.", "nanophan."
    #> $ rank3                  <lgl> NA, NA, NA
    #> $ sp3                    <lgl> NA, NA, NA
    #> $ author3                <lgl> NA, NA, NA
    #> $ binomial               <chr> "'abi':1 'sachalinensi':2", "'pleurocarpa':2 's…
    postgrest_qry("seed_weights", limit = 3) |> glimpse()
    #> Rows: 3
    #> Columns: 9
    #> $ id                 <dbl> 60755, 60756, 60765
    #> $ species_id         <dbl> 13, 15, 20
    #> $ thousandseedweight <dbl> 28.6, 34.5, 20.0
    #> $ materialweighed_id <dbl> 1, 1, 1
    #> $ notes              <lgl> NA, NA, NA
    #> $ reference_id       <dbl> 1380, 1380, 1380
    #> $ precision          <dbl> 1, 1, 1
    #> $ cultivar           <lgl> NA, NA, NA
    #> $ serialnum          <lgl> NA, NA, NA
    
    # it's probably safe to assume that seed_weights.species_id is a foregin key to species,
    # so we can embed those resources / join tables
    
    # get seed weights for all id-s in summary_df
    (id_q <- str_c("in.(", str_c(summary_df$id, collapse = ","), ")"))
    #> [1] "in.(bb851d35-2b67-48d1-8b0b-bda2bbd05f42,e7043715-6324-415e-83f8-02d282f7b5f8)"
    
    # id,int_id,genus,epithet from species joined with seed_weights table
    (seed_weights_df <- postgrest_qry("species", id = id_q, select = "id,int_id,genus,epithet,seed_weights(*)"))
    #> # A tibble: 2 × 5
    #>   id                                   int_id genus      epithet    seed_weights
    #>   <chr>                                 <dbl> <chr>      <chr>      <chr>       
    #> 1 bb851d35-2b67-48d1-8b0b-bda2bbd05f42  16286 Oenanthe   aquatica   "[{\"id\":9…
    #> 2 e7043715-6324-415e-83f8-02d282f7b5f8   2649 Astragalus glycyphyl… "[{\"id\":7…
    
    # one-to-many realtionship, PostgREST handles this through JSON objects even though we have asked for CSV;
    # parse JSON column (row-wise), unnest
    seed_weights_df |>
      rowwise() |>
      mutate(w = jsonlite::parse_json(seed_weights, simplifyVector = TRUE) |> list(), .keep = "unused") |>
      unnest(w, names_sep = ".")
    #> # A tibble: 18 × 13
    #>    id              int_id genus epithet   w.id w.species_id w.thousandseedweight
    #>    <chr>            <dbl> <chr> <chr>    <int>        <int>                <dbl>
    #>  1 bb851d35-2b67-…  16286 Oena… aquati…  90114        16286                 2.19
    #>  2 bb851d35-2b67-…  16286 Oena… aquati…  90159        16286                 2.41
    #>  3 bb851d35-2b67-…  16286 Oena… aquati… 139102        16286                 2.16
    #>  4 e7043715-6324-…   2649 Astr… glycyp…  79094         2649                 4.5 
    #>  5 e7043715-6324-…   2649 Astr… glycyp…  79099         2649                 7.17
    #>  6 e7043715-6324-…   2649 Astr… glycyp…  85515         2649                 4.22
    #>  7 e7043715-6324-…   2649 Astr… glycyp…  85639         2649                 4.05
    #>  8 e7043715-6324-…   2649 Astr… glycyp…  87647         2649                 6.05
    #>  9 e7043715-6324-…   2649 Astr… glycyp…  88668         2649                 5.07
    #> 10 e7043715-6324-…   2649 Astr… glycyp…  88909         2649                 5.23
    #> 11 e7043715-6324-…   2649 Astr… glycyp…  92388         2649                 3.50
    #> 12 e7043715-6324-…   2649 Astr… glycyp… 104280         2649                 3.00
    #> 13 e7043715-6324-…   2649 Astr… glycyp… 105741         2649                 3.53
    #> 14 e7043715-6324-…   2649 Astr… glycyp… 142103         2649                 5.86
    #> 15 e7043715-6324-…   2649 Astr… glycyp…  62507         2649                 4.5 
    #> 16 e7043715-6324-…   2649 Astr… glycyp…  62508         2649                 4.54
    #> 17 e7043715-6324-…   2649 Astr… glycyp…  77077         2649                 4.87
    #> 18 e7043715-6324-…   2649 Astr… glycyp…  84292         2649                 4.66
    #> # ℹ 6 more variables: w.materialweighed_id <int>, w.notes <chr>,
    #> #   w.reference_id <int>, w.precision <chr>, w.cultivar <lgl>,
    #> #   w.serialnum <chr>
    

    Created on 2024-05-28 with reprex v2.1.0