Tags: r, web-scraping, rvest, httr

Scraping images from a website into subfolders


I am trying to scrape images from this website: https://moweek.com.uy/.

There are several top-level headers on the site: "VESTIMENTA", "CALZADO", "ACCESORIOS", "BEAUTY", "MARCAS", "AGENDA" and "BLOG". I want to "click" on "VESTIMENTA" and then into each of its subheaders ("Activewear", "Blazers y chaquetas", and so on), but I am having trouble with code that worked in the past.


My main goal is to download the images from each subheader (category) into a folder called "images", inside a sub-folder named after the category ("activewear", "blazer_y_chaquetas", etc.). I also want to download information about each picture, but for now I have that separated out into another script.
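
For reference, the folder layout I am after can be sketched with base R alone (the two category names below are only illustrative):

    dir.create("images", showWarnings = FALSE)
    for (category in c("activewear", "blazer_y_chaquetas")) {
      dir.create(file.path("images", category), showWarnings = FALSE)
    }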

This is what I have so far:

library(tidyverse)
library(rvest)
library(httr)

url <- "https://moweek.com.uy/"
html_content <- GET(url)

webpage <- read_html(content(html_content, as = "text"))

category_nodes <- html_nodes(webpage, "a.headerOption.headerLink[data-category-id='1']")
category_urls <- html_attr(category_nodes, "href")

dir.create("images", showWarnings = FALSE)

# Loop through subcategories and download images
for (category_url in category_urls) {
  # Remove any extra slashes in the URL construction
  subcategory_url <- paste0(url, gsub("/+", "/", category_url))
  
  subcategory_page <- tryCatch({
    read_html(subcategory_url)
  }, error = function(e) {
    message(paste("Error accessing URL:", subcategory_url))
    return(NULL)
  })
  
  if (is.null(subcategory_page)) {
    next  # Skip to the next subcategory if an error occurred
  }
  
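  # NOTE: html_text() on the whole document returns all visible text on the
  # page, not the category name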
  subcategory_name <- subcategory_page %>% html_text()
  
  # Clean the subcategory name for folder creation
  subfolder_name <- gsub(" ", "_", tolower(subcategory_name))
  subfolder_path <- file.path("images", subfolder_name)
  dir.create(subfolder_path, showWarnings = FALSE)
  
  # Extract image URLs and download them into the subfolder
  image_urls <- subcategory_page %>% 
    html_nodes(".your-image-selector") %>%
    html_attr("src")
  
  for (image_url in image_urls) {
    image_name <- basename(image_url)
    download.file(image_url, file.path(subfolder_path, image_name))
  }
}

Solution

  • This worked perfectly!

    pacman::p_load(tidyverse, rvest, httr)
    
    url <- "https://moweek.com.uy"
    webpage <- httr::GET(url)
    
    webpage <- read_html(content(webpage, as = "text"))
    
    category_nodes <- html_nodes(webpage, ".expandedCategory")
    
    category_urls <- lapply(category_nodes, function(node) html_nodes(node, "a") %>%
                              html_attr("href")) %>%
      unlist() %>%
      str_subset("vestimenta") %>%
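      # NOTE: this replaces every "1" in each href with "30" (e.g.
      # /vestimenta/activewear/1 -> /vestimenta/activewear/30), which is only
      # safe while no category slug itself contains a "1"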
      str_replace_all("1", "30")
    
    category_urls <- category_urls[!(category_urls %in% c(
      "/vestimenta/30",
      "/vestimenta/activewear/30",
      "/vestimenta/bodywear/30",
      "/vestimenta/leggings-y-bikers/30",
      "/vestimenta/lenceria/30",
      "/vestimenta/hombre/30",
      "/vestimenta/ninos/30",
      "/vestimenta/pijamas-y-camisones/30",
      "https://moweek.com.uy/vestimenta/vestidos/30",
      "https://moweek.com.uy/vestimenta/trajes-de-bano/30"
    ))]
    
    dir.create("images", showWarnings = FALSE)
    
    # Function to limit folder name length
    limit_folder_name_length <- function(category_url, max_length = 50) {
      # Extract the subcategory name from the category_url
      subcategory_name <- gsub(".*/vestimenta/(.*?)/30", "\\1", category_url)
      
      if (nchar(subcategory_name) > max_length) {
        subcategory_name <- substr(subcategory_name, 1, max_length)
      }
      return(subcategory_name)
    }
    
    # Loop through subcategories and download images
    for (category_url in category_urls) {
      
      subcategory_name <- limit_folder_name_length(category_url)
      subfolder_name <- gsub(" ", "_", tolower(subcategory_name))
      subfolder_path <- file.path("images", subfolder_name)
      dir.create(subfolder_path, showWarnings = FALSE, recursive = TRUE)
      
      subcategory_url <- paste0(url, gsub("/+", "/", category_url))
      
      # Sleep for a few seconds to avoid overloading the server
      Sys.sleep(3)
      
      cat_content <- RETRY("GET", subcategory_url)
      cat_page <- read_html(content(cat_content, as = "text"))
      
      image_urls <- cat_page %>%
        html_nodes(".productViewTopImage") %>%
        html_attr("src")
      
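      # Prefix relative image paths with the site root so download.file()
      # receives absolute URLs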
      image_urls <- ifelse(substr(image_urls, 1, 8) != "https://", paste0(url, image_urls), image_urls)
      
      for (image_url in image_urls) {
        if (!is.na(image_url) && image_url != "") {
          image_name <- basename(image_url)
          image_path <- file.path(subfolder_path, image_name)
          
          # Download and save the image to the subfolder
          download.file(image_url, image_path, mode = "wb")
        }
      }
    }
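
One fragility worth noting in the loop above: download.file() throws an error when a single image fails, which aborts the entire run. A small wrapper along these lines (a sketch, not part of the accepted answer) keeps the loop going; quiet = TRUE is a standard download.file() argument that suppresses progress output:

    # Returns TRUE on success, FALSE on failure, without stopping the loop
    safe_download <- function(image_url, image_path) {
      tryCatch({
        download.file(image_url, image_path, mode = "wb", quiet = TRUE)
        TRUE
      }, error = function(e) {
        message("Failed to download: ", image_url)
        FALSE
      })
    }
    
    # Drop-in replacement for the download.file() call in the inner loop
    safe_download(image_url, image_path)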