I am trying to scrap this website and extract content from multiple pages. I used a while loop to increase the page number every time but it is not working. All it does is duplicate the same content from page 0, as opposed to getting the content from the other pages (in the example below 3 times).
I'm not sure what I am missing.
library(tidyverse)
library(rvest)
base_url <- "https://npin.cdc.gov/search?=type%3Aorganization&page="
# counter:
e <- 0
result <- data.frame(org_name = as.integer(),
services = as.character(),
item = as.character(),
street = as.character(),
city = as.character(),
state = as.character())
while (e <= 3) {
cat(".")
new_page <-
read_html(
# change the page number with each iteration of the loop
sprintf(
base_url,
e
)
)
services <- new_page %>% html_nodes(".services-fieldset")
org_name <- new_page %>% html_nodes(".block-field-blocknodeorganizationtitle") %>% html_text2()
street <- new_page |> html_nodes(".address-line1") |> html_text2()
city <- new_page |> html_nodes(".locality") |> html_text2()
state <- new_page |> html_nodes(".administrative-area") |> html_text2()
zip <- new_page |> html_nodes(".postal-code") |> html_text2()
new_result <- for (i in 1:length(services)) {
temp <- services[i] %>% html_nodes(".field__items")
for (j in 2:length(temp)) {
label <- temp[j] %>% html_nodes(".field-label") %>% html_text() %>% gsub(":", "", .)
items <- temp[j] %>% html_nodes(".field__item") %>% html_text()
result[nrow(result) +1, ] <- c(org_name[i], label, paste0(items, collapse = ","))
}
}
# append each iteration of the data to the main table:
result <-
rbind(
result,
new_result
)
# increase counter
e <- e + 1
}
result |> tibble()
There's quite a few issues here. You already fixed the non-existing new_york_raw
object, but it still doesn't make sense to grow result
inside the loop, and then bind that to new_result
which is the output of a for
loop, i.e. NULL
.
Probably the biggest issue is that you're creating your URL via sprintf(base_url, e)
which doesn't do what you think it does -- you probably want paste0(base_url, e)
or similar.
A combined fix could look like the following. I replaced the loops by do.call
and *apply
which makes not too much difference for performance but is a more natural way in R to bind data together.
library(tidyverse)
library(rvest)
base_url <- "https://npin.cdc.gov/search?=type%3Aorganization&page="
all_e <- 0:3
result <- do.call(rbind.data.frame, lapply(all_e, \(e) {
new_page <- read_html(paste0(base_url, e)) ## Use correct URL
services <- html_nodes(new_page, ".services-fieldset")
orgs <- html_nodes(new_page, ".block-field-blocknodeorganizationtitle") |> html_text2()
streets <- html_nodes(new_page, ".address-line1") |> html_text2()
cities <- html_nodes(new_page, ".locality") |> html_text2()
states <- html_nodes(new_page, ".administrative-area") |> html_text2()
zips <- html_nodes(new_page, ".postal-code") |> html_text2()
do.call(rbind, lapply(seq_along(services), \(i) {
tmp <- html_nodes(services[i], ".field__items")
cbind(
data.frame(
Org=orgs[i],
Street=streets[i],
City=cities[i],
State=states[i],
Zip=zips[i]
),
do.call(rbind, lapply(tmp[-1], \(t) {
label <- html_nodes(t, ".field-label") |> html_text() |> gsub(":", "", x=_)
items <- html_nodes(t, ".field__item") |> html_text()
c("Label"=label, "Items"=paste0(items, collapse = ","))
}))
)
}))
}))
result |> tibble()
As a minor pet peeve I have no idea why one would write x <- y |> f()
rather than just x <- f(y)
?