I am trying to scrape images from this website: https://moweek.com.uy/.
The site has several top-level headers: "VESTIMENTA", "CALZADO", "ACCESORIOS", "BEAUTY", "MARCAS", "AGENDA" and "BLOG". I want to "click" on "VESTIMENTA" and then open each of its subcategories ("Activewear", "Blazers y chaquetas", etc.), as they appear in the following picture, but I am having trouble with code that worked in the past:
My main goal is to download the images of each subheader (category) into a folder called "images", inside a sub-folder named after the category ("activewear", "blazer_y_chaquetas", etc.). I also want to download information about each picture, but that part now lives in a separate script.
This is what I have so far:
library(tidyverse)
library(rvest)
library(httr)
# Fetch the landing page and collect the href of every anchor matching the
# category selector; these hrefs drive the download loop below.
url <- "https://moweek.com.uy/"
html_content <- GET(url)
webpage <- html_content %>%
  content(as = "text") %>%
  read_html()
category_nodes <- webpage %>%
  html_nodes("a.headerOption.headerLink[data-category-id='1']")
category_urls <- category_nodes %>%
  html_attr("href")
# Root output folder; showWarnings = FALSE keeps reruns quiet when it exists.
dir.create("images", showWarnings = FALSE)
# Loop through subcategories and download images
for (category_url in category_urls) {
  # Anchors without a usable href cannot be visited.
  if (is.na(category_url) || !nzchar(category_url)) {
    next
  }

  # Absolute hrefs are used as-is; relative ones are joined to the base with
  # exactly one "/" (the base already ends in "/", so plain concatenation
  # would produce "//").
  subcategory_url <- if (grepl("^https?://", category_url)) {
    category_url
  } else {
    paste0(sub("/+$", "", url), "/", sub("^/+", "", category_url))
  }

  subcategory_page <- tryCatch(
    read_html(subcategory_url),
    error = function(e) {
      message(paste("Error accessing URL:", subcategory_url))
      NULL
    }
  )
  if (is.null(subcategory_page)) {
    next # Skip to the next subcategory if an error occurred
  }

  # Name the folder after the href rather than the text of the whole page
  # (which is far too long and contains characters that are invalid in a
  # path). Query/fragment parts and purely numeric segments are dropped, then
  # the last remaining path segment is used.
  # NOTE(review): assumes hrefs look like ".../<category>/<number>" - confirm.
  path_parts <- strsplit(sub("[?#].*$", "", category_url), "/", fixed = TRUE)[[1]]
  path_parts <- path_parts[nzchar(path_parts) & !grepl("^[0-9]+$", path_parts)]
  subcategory_name <- if (length(path_parts) > 0) {
    path_parts[length(path_parts)]
  } else {
    "uncategorized"
  }

  # Clean the subcategory name for folder creation
  subfolder_name <- gsub("[^[:alnum:]]+", "_", tolower(subcategory_name))
  subfolder_path <- file.path("images", subfolder_name)
  dir.create(subfolder_path, showWarnings = FALSE, recursive = TRUE)

  # Extract image URLs and download them into the subfolder
  # NOTE(review): ".your-image-selector" looks like a placeholder - replace it
  # with the CSS selector of the product images before running.
  image_urls <- subcategory_page %>%
    html_nodes(".your-image-selector") %>%
    html_attr("src")
  image_urls <- image_urls[!is.na(image_urls) & nzchar(image_urls)]

  # Turn protocol-relative and site-relative sources into absolute URLs.
  image_urls <- sub("^//", "https://", image_urls)
  is_relative <- !grepl("^https?://", image_urls)
  image_urls[is_relative] <- paste0(
    sub("/+$", "", url), "/", sub("^/+", "", image_urls[is_relative])
  )

  for (image_url in image_urls) {
    # Strip any query string so it does not end up in the file name.
    image_name <- basename(sub("[?#].*$", "", image_url))
    # mode = "wb" keeps binary image data intact on Windows; a single failed
    # download is reported and skipped instead of aborting the whole run.
    tryCatch(
      download.file(image_url, file.path(subfolder_path, image_name), mode = "wb"),
      error = function(e) {
        message("Failed to download ", image_url, ": ", conditionMessage(e))
      }
    )
  }
}
This worked perfectly!
pacman::p_load(tidyverse, rvest, httr)
# Base URL of the shop (no trailing slash).
url <- "https://moweek.com.uy"
webpage <- httr::GET(url)
webpage <- read_html(content(webpage, as = "text"))
category_nodes <- html_nodes(webpage, ".expandedCategory")

# Collect every link inside the expanded category menus, keep the
# "vestimenta" ones and rewrite the trailing "/1" segment to "/30".
# The pattern is anchored to the end of the href so that any other "1"
# appearing in a URL is left untouched; duplicates are dropped so no
# category is downloaded twice.
# NOTE(review): the meaning of the trailing number is not visible here -
# presumably it controls how many products are listed; confirm.
category_urls <- category_nodes %>%
  html_nodes("a") %>%
  html_attr("href") %>%
  str_subset("vestimenta") %>%
  str_replace("/1$", "/30") %>%
  unique()

# Categories that are deliberately skipped by the download loop.
excluded_urls <- c(
  "/vestimenta/30", "/vestimenta/activewear/30",
  "/vestimenta/bodywear/30", "/vestimenta/leggings-y-bikers/30",
  "/vestimenta/lenceria/30", "/vestimenta/hombre/30",
  "/vestimenta/ninos/30", "/vestimenta/pijamas-y-camisones/30",
  "https://moweek.com.uy/vestimenta/vestidos/30",
  "https://moweek.com.uy/vestimenta/trajes-de-bano/30"
)
category_urls <- category_urls[!(category_urls %in% excluded_urls)]

# Root output folder; showWarnings = FALSE keeps reruns quiet when it exists.
dir.create("images", showWarnings = FALSE)
#' Derive a length-limited folder name from a category URL
#'
#' Extracts the part of `category_url` between "/vestimenta/" and "/30" and
#' truncates it to at most `max_length` characters. Input that does not match
#' that shape (for example an already-extracted name) is only truncated.
#' Works element-wise on character vectors; `NA` elements stay `NA`.
#'
#' @param category_url Character vector of category hrefs or names.
#' @param max_length Maximum number of characters to keep (default 50).
#' @return A character vector the same length as `category_url`.
limit_folder_name_length <- function(category_url, max_length = 50) {
  # Extract the subcategory name from the category_url
  subcategory_name <- gsub(".*/vestimenta/(.*?)/30", "\\1", category_url)
  # substr() leaves shorter strings untouched and is vectorised, so no
  # explicit nchar() guard is needed (that guard failed on vectors and NA).
  substr(subcategory_name, 1, max_length)
}
# Loop through subcategories and download images
for (category_url in category_urls) {
  # Folder name: the URL segment between "/vestimenta/" and "/30", capped in
  # length, lower-cased and with spaces replaced by underscores.
  subfolder_name <- gsub(" ", "_", tolower(limit_folder_name_length(category_url)))
  subfolder_path <- file.path("images", subfolder_name)
  dir.create(subfolder_path, showWarnings = FALSE, recursive = TRUE)

  # Absolute hrefs are used unchanged (collapsing "/+" would break the "//"
  # after the scheme); relative ones are joined to the base with one "/".
  subcategory_url <- if (grepl("^https?://", category_url)) {
    category_url
  } else {
    paste0(sub("/+$", "", url), "/", sub("^/+", "", category_url))
  }

  # Sleep for a few seconds to avoid overloading the server
  Sys.sleep(3)

  # RETRY() raises on connection failures once its attempts are exhausted and
  # returns the response object for HTTP error statuses, so handle both.
  cat_content <- tryCatch(
    RETRY("GET", subcategory_url),
    error = function(e) {
      message("Request failed for ", subcategory_url, ": ", conditionMessage(e))
      NULL
    }
  )
  if (is.null(cat_content)) {
    next
  }
  if (http_error(cat_content)) {
    message("Skipping ", subcategory_url, " (HTTP ", status_code(cat_content), ")")
    next
  }

  cat_page <- read_html(content(cat_content, as = "text"))
  image_urls <- cat_page %>%
    html_nodes(".productViewTopImage") %>%
    html_attr("src")
  image_urls <- image_urls[!is.na(image_urls) & image_urls != ""]

  # Turn protocol-relative and site-relative sources into absolute URLs;
  # sources that already carry an http(s) scheme are left alone.
  image_urls <- sub("^//", "https://", image_urls)
  is_relative <- !grepl("^https?://", image_urls)
  image_urls[is_relative] <- paste0(
    sub("/+$", "", url), "/", sub("^/+", "", image_urls[is_relative])
  )

  for (image_url in image_urls) {
    # Strip any query string so it does not end up in the file name.
    image_name <- basename(sub("[?#].*$", "", image_url))
    image_path <- file.path(subfolder_path, image_name)
    # Download and save the image to the subfolder; a failed download is
    # reported and skipped rather than aborting the remaining categories.
    tryCatch(
      download.file(image_url, image_path, mode = "wb"),
      error = function(e) {
        message("Failed to download ", image_url, ": ", conditionMessage(e))
      }
    )
  }
}