Web scraping of the disciplinary assignment of university courses

I would like to scrape a university-course catalog with R. My code is already quite good, but the assignment of courses to disciplines and subdisciplines does not yet work the way I want it to.

This is my code:

# loading needed libraries -----------------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, rvest, xml2)
html_code <- read_html(https://www.uni-bremen.de/studium/starten-studieren/veranstaltungsverzeichnis?tx_hbulvp_pi1%5Bmodule%5D=93fdb6be384979f7300d263ba0c094be&tx_hbulvp_pi1%5Bsem%5D=39)
# Eine rekursive Funktion, um Informationen unter jedem h-Tags zu sammeln
extract_module_info <- function(node, module_path = c()) {
  # Basisfall: Wenn der Knoten leer ist, beenden Sie die Rekursion
  if (length(node) == 0) return(tibble())
  current_tag <- node %>% html_name()
  current_text <- node %>% html_text(trim = TRUE)
  # Aktualisieren des Pfades mit dem aktuellen Modul/Submodul
  new_path <- c(module_path, current_text)
  # Suchen nach dem nächsten div, das die Details enthält
  details_node <- node %>% html_node(xpath = "./following-sibling::div[1]")
  # Sammeln von Detailinformationen, wenn vorhanden
  if (!is.null(details_node) && length(html_nodes(details_node, 'tr')) > 0) {
    details <- html_nodes(details_node, 'tr') %>%
      map_df(~{
        tibble(
          ModulePath = list(new_path),
          CourseDesc = html_nodes(.x, '.expander') %>%
            html_text(trim = TRUE) %>%
            ifelse(length(.) == 0, NA_character_, .),
          CourseElse = html_nodes(.x, 'td:nth-child(2)') %>%
            html_text(trim = TRUE) %>%
            ifelse(length(.) == 0, NA_character_, .),
          CourseVAK = html_nodes(.x, 'td:nth-child(1)') %>%
            html_text(trim = TRUE) %>%
            ifelse(length(.) == 0, NA_character_, .),
          CourseTitle = html_nodes(.x, 'strong') %>%
            html_text(trim = TRUE) %>%
            ifelse(length(.) == 0, NA_character_, .),
          CourseTeacher = html_nodes(.x, 'td ~ td + td') %>%
            html_text(trim = TRUE) %>%
            ifelse(length(.) == 0, NA_character_, .)
        )
      })
  } else {
    details <- tibble(ModulePath = list(new_path))
  }
  # Rekursiver Abstieg zum nächsten h-Tag, falls vorhanden
  next_node <- node %>% html_node(xpath = "./following-sibling::*[self::h2 or self::h3 or self::h4][1]")
  child_details <- extract_module_info(next_node, new_path)
  # Kombinieren der aktuellen Details mit den rekursiv gesammelten Details
  bind_rows(details, child_details)
}
# Anwendung der Funktion auf das gesamte Dokument, startend mit dem ersten h2-Tag
results <- html_nodes(html_code, 'h2') %>% map_df(~extract_module_info(.x))
# Ausgabe der Ergebnisse
print(results)

The problem arises on the variable ModulePath. Let's take a look at the course offering with the VAK ID SZHB 0806 as an example:

> results |> slice(544) |> select(CourseVAK )
# A tibble: 1 × 1
  CourseVAK
  <chr>
1 SZHB 0806

And there the value of ModulePath:


 > results |> slice(544) |> select(ModulePath) |> pull()
[[1]]
 [1] "Language Center of the Universities in the State of Bremen"
 [2] "Arabic"
 [3] "Chinese"
 [4] "German"
 [5] "German sign language"
 [6] "English"
 [7] "French"
 [8] "Hebrew (modern)"
 [9] "Italian"
[10] "Japanese"
[11] "Catalan"
[12] "Korean"
[13] "Croatian"
[14] "Kurdish"
[15] "Latin"
[16] "Dutch"
[17] "Polish"

Unfortunately, all h4 headers are scraped up to the respective course offer.

What I would like to have is a result like this:

[[1]]
 [1] "Language Center of the Universities in the State of Bremen"
 [2] "Polish"

Of course, you could now remove all elements except the first and the last. But is that a reliable way given the fact that I don't know the structures of other pages? Maybe the headers will be more nested in the future.

Solution

cool code and nice project. I know the pain, when html-code is not properly nested. Makes things always a bit harder.

I kind of rewrote the logic of your code, because it seemed easier to me, to take advantage of the implicit sorting of the different styles of headers. If you take advatnage of keeping them that way you can just copy down the values from previous nodes wiht the nice fill function from the tidyverse.

Please double-check, that all still extracs the values as expected.

EDIT: I have added a modification that also scrapes the occasional "p strong" tag as a header. It becomes more convoluted of course, just did not have the time to think of something prettier, but I think it works.

# loading needed libraries -----------------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, rvest, xml2)
html_code_1 <- read_html("https://www.uni-bremen.de/studium/starten-studieren/veranstaltungsverzeichnis?tx_hbulvp_pi1%5Bmodule%5D=93fdb6be384979f7300d263ba0c094be&tx_hbulvp_pi1%5Bsem%5D=39")
html_code_2 <- read_html("https://www.uni-bremen.de/studium/starten-studieren/veranstaltungsverzeichnis?tx_hbulvp_pi1%5Bmodule%5D=54c6fd5c0b74c8c6b7f81ab2939a7196&tx_hbulvp_pi1%5Bsem%5D=40")

result <- html_elements(html_code_2, '.tx-hbulvp-pi1-module') |>
  ## map over the different modules
  map_dfr(function(main_module) {
    ## extract the children of these, which are studium generale, etc.
    html_children(main_module) |>
      map_dfr(function(headers) {
        if (as.character(headers) |> str_detect("h2")) {
          return(tibble(h2 = html_text(headers)))
        } else if (as.character(headers) |> str_detect("h3")) {
          return(tibble(h3 = html_text(headers)))
        } else if (as.character(headers) |> str_detect("h4")) {
          return(tibble(h4 = html_text(headers)))
        } else if (as.character(headers) |> str_detect("<p")) {
          return(tibble(p_strong = html_text(headers)))
        } else if (as.character(headers) |> str_detect("div")) {
          content <- map_dfr(headers |> html_elements("tr"),
                             function(tr) {
                               tibble(
                                 CourseDesc = html_nodes(tr, '.expander') %>%
                                   html_text(trim = TRUE) %>%
                                   ifelse(length(.) == 0, NA_character_, .),
                                 CourseElse = html_nodes(tr, 'td:nth-child(2)') %>%
                                   html_text(trim = TRUE) %>%
                                   ifelse(length(.) == 0, NA_character_, .),
                                 CourseVAK = html_nodes(tr, 'td:nth-child(1)') %>%
                                   html_text(trim = TRUE) %>%
                                   ifelse(length(.) == 0, NA_character_, .),
                                 CourseTitle = html_nodes(tr, 'strong') %>%
                                   html_text(trim = TRUE) %>%
                                   ifelse(length(.) == 0, NA_character_, .),
                                 CourseTeacher = html_nodes(tr, 'td ~ td + td') %>%
                                   html_text(trim = TRUE) %>%
                                   ifelse(length(.) == 0, NA_character_, .)
                               )
                               
                             })
          return(content)
        }
      })
  }) |>
  fill(h2, .direction = "down") |>
  group_by(h2) |>
  fill(h3, .direction = "down") |>
  group_by(h3,h2) |>
  fill(h4, .direction = "down")|>
  group_by(h2,h3,h4)

if("p_strong" %in% names(result)){
  result <- result |> 
    fill(p_strong, .direction = "down") |>
    filter(!is.na(CourseDesc)) |>
    select(h2, h3, h4,p_strong, everything())
} else {
  result <- result |> 
    filter(!is.na(CourseDesc)) |>
    select(h2, h3, h4, everything())
}

EDIT 2: This is becoming a bit of a Frankenstein-Monster. Additional changes to implement missing headers as well. Now in a more concise function logic, but with a lot of overhead and quite slow :S

# loading needed libraries -----------------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, rvest, xml2)
html_code_1 <- read_html("https://www.uni-bremen.de/studium/starten-studieren/veranstaltungsverzeichnis?tx_hbulvp_pi1%5Bmodule%5D=93fdb6be384979f7300d263ba0c094be&tx_hbulvp_pi1%5Bsem%5D=39")
html_code_2 <- read_html("https://www.uni-bremen.de/studium/starten-studieren/veranstaltungsverzeichnis?tx_hbulvp_pi1%5Bmodule%5D=54c6fd5c0b74c8c6b7f81ab2939a7196&tx_hbulvp_pi1%5Bsem%5D=40")
html_code_3 <- read_html("https://www.uni-bremen.de/studium/starten-studieren/veranstaltungsverzeichnis?tx_hbulvp_pi1%5Bmodule%5D=2332b6fb86a3cb7c42b4d6bb08ca3ef3&tx_hbulvp_pi1%5Bsem%5D=40")


fill_down <- function(result) {

  # Fill h2 if it exists
  if ("h2" %in% names(result)) {
    result <- result |> fill(h2, .direction = "down")
  }
  
  # Fill h3 if it exists and is grouped by h2
  if ("h3" %in% names(result)) {
    result <- result |> group_by(h2) |> fill(h3, .direction = "down") |> ungroup()
  }
  
  # Fill h4 if it exists and is grouped by both h2 and h3
  if ("h4" %in% names(result)) {
    result <- result |> group_by(h2, h3) |> fill(h4, .direction = "down") |> ungroup()
  }
  
  # Fill p_strong if it exists, grouped by h2, h3, and h4
  if ("p_strong" %in% names(result)) {
    result <- result |> group_by(h2, h3, h4) |> fill(p_strong, .direction = "down") |> ungroup()
  }
  
  # After filling, filter entries where 'CourseDesc' is not NA and reset grouping
  result <- result |> 
    filter(!is.na(CourseTitle)) |> 
    select(any_of(c("h2", "h3", "h4", "p_strong")), everything())
  
  return(result)
}

parse_html <- function(html){
  parsed_result <- html_elements(html_code_2, '.tx-hbulvp-pi1-module') |>
    ## map over the different modules
    map_dfr(function(main_module) {
      ## extract the children of these, which are studium generale, etc.
      html_children(main_module) |>
        map_dfr(function(headers) {
          if (as.character(headers) |> str_detect("h2")) {
            return(tibble(h2 = html_text(headers)))
          } else if (as.character(headers) |> str_detect("h3")) {
            return(tibble(h3 = html_text(headers)))
          } else if (as.character(headers) |> str_detect("h4")) {
            return(tibble(h4 = html_text(headers)))
          } else if (as.character(headers) |> str_detect("<p")) {
            return(tibble(p_strong = html_text(headers)))
          } else if (as.character(headers) |> str_detect("div")) {
            content <- map_dfr(headers |> html_elements("tr"),
                               function(tr) {
                                 tibble(
                                   CourseDesc = html_nodes(tr, '.expander') %>%
                                     html_text(trim = TRUE) %>%
                                     ifelse(length(.) == 0, NA_character_, .),
                                   CourseElse = html_nodes(tr, 'td:nth-child(2)') %>%
                                     html_text(trim = TRUE) %>%
                                     ifelse(length(.) == 0, NA_character_, .),
                                   CourseVAK = html_nodes(tr, 'td:nth-child(1)') %>%
                                     html_text(trim = TRUE) %>%
                                     ifelse(length(.) == 0, NA_character_, .),
                                   CourseTitle = html_nodes(tr, 'strong') %>%
                                     html_text(trim = TRUE) %>%
                                     ifelse(length(.) == 0, NA_character_, .),
                                   CourseTeacher = html_nodes(tr, 'td ~ td + td') %>%
                                     html_text(trim = TRUE) %>%
                                     ifelse(length(.) == 0, NA_character_, .)
                                 )
                                 
                               })
            return(content)
          }
        })
    }) |> fill_down()
  return(parsed_result)
}

results <- map_dfr(list(html_code_1, html_code_2, html_code_3), parse_html)