I want to download the images from this site, https://moweek.com.uy/, in R — specifically, those under the elements with class = "expandedCategory": "VESTIMENTA", "CALZADO", and "ACCESORIOS" (data-category-id="1", data-category-id="2", and data-category-id="3"). For example, I want to go into each of the subcategories under "VESTIMENTA" (except "Ver todo"), such as "Activewear" (<a data-category-id="57" href="/vestimenta/activewear/1" class=" categoryLevelTwoTitle selected">Activewear</a>), click on every image, and collect all of the information in a dataset, as well as the images themselves.
I've been trying to do this, but I am failing.
pacman::p_load(tidyverse, rvest, httr)
# Asker's attempt: crawl the category pages linked from the home page, build a
# data frame of product name / price / image URL, then fetch each image.
# The snippet is incomplete (it stops in the middle of the final loop).

# Fetch the home page and parse its body as HTML.
url <- "https://moweek.com.uy/"
html_content <- GET(url)
webpage <- read_html(content(html_content, as = "text"))
# Gather the href of every <a> below the ".expandedCategory" nodes and keep
# only those containing /vestimenta/, /calzado/ or /accesorios/.
category_nodes <- html_nodes(webpage, ".expandedCategory")
category_urls <- lapply(category_nodes, function(node) html_nodes(node, "a") %>%
html_attr("href")) %>%
unlist() %>%
str_subset("/vestimenta/|/calzado/|/accesorios/")
# Accumulator for the rows collected from all category pages.
image_data <- data.frame()
# NOTE: the loop variable reuses, and therefore overwrites, `url` from above.
for (url in category_urls) {
# The hrefs are treated as site-relative paths and prefixed with the host.
cat_url <- paste0("https://moweek.com.uy", url)
cat_content <- GET(cat_url)
cat_page <- read_html(content(cat_content, as = "text"))
# Extract the category name and the subcategory name from the page title
cat_name <- html_text(html_node(cat_page, "title")) %>%
str_replace("- MoWeek", "") %>%
str_to_title()
# html_node() yields a single node, so this reads the first
# ".categoryLevelTwoTitle" element on the page.
# NOTE(review): presumably that should be the subcategory being visited —
# confirm it is not simply the first entry of the menu.
subcat_name <- html_text(html_node(cat_page, ".categoryLevelTwoTitle")) %>%
str_to_title()
# Extract the image information from the page and store it in a data frame
# NOTE(review): ".productItem", ".productTitle" and ".priceText" are assumed to
# exist in the site's markup — verify against the page source.
image_tags <- html_nodes(cat_page, ".productItem")
# NOTE(review): image_info starts with no columns; add_row() presumably refuses
# to add values for columns that are not already present — likely one reason
# this attempt fails. TODO confirm.
image_info <- data.frame()
for (tag in image_tags) {
name <- html_text(html_node(tag, ".productTitle"))
price <- html_text(html_node(tag, ".priceText"))
img_url <- html_attr(html_node(tag, "img"), "src")
image_info <- image_info %>%
add_row(Category = cat_name,
Subcategory = subcat_name,
Name = name,
Price = price,
Image_URL = img_url)
}
image_data <- image_data %>% bind_rows(image_info)
}
# Fetch every image and prepare a <Category>/<Subcategory> folder for it.
# NOTE: if image_data has zero rows, 1:nrow(image_data) is c(1, 0), so the body
# would still be entered.
for (i in 1:nrow(image_data)) {
# The response is stored in `img`, but nothing visible here writes it to disk.
img <- GET(image_data$Image_URL[i])
category <- image_data$Category[i]
subcategory <- image_data$Subcategory[i]
# Folder names are the labels with spaces replaced by underscores.
category_folder <- gsub(" ", "_", category)
subcategory_folder <- gsub(" ", "_", subcategory)
# Create the category folder, then the subcategory folder inside it.
if (!dir.exists(category_folder)) {
dir.create(category_folder)
}
if (!dir.exists(paste0(category_folder, "/", subcategory_folder))) {
dir.create(paste0(category_folder, "/", subcategory_folder))
}
# NOTE: this assigns the function `paste` itself; the statement looks
# truncated and the attempt ends here without saving any file.
file_name <- paste
}
This code seemed to work for me.
library(tidyverse)
library(rvest)
library(httr)
# Scrape the product listings (name, price, image URL) of every VESTIMENTA /
# CALZADO / ACCESORIOS subcategory page linked from the home page, then
# download each product image into <Category>/<Subcategory>/.
#
# Results left in the workspace:
#   image_data - tibble with Category, Subcategory, Name, Price, Image_URL
#   file_names - file name used for each row of image_data (NA if no image)

base_url <- "https://moweek.com.uy"
url <- paste0(base_url, "/")

# Fetch a page and parse it as HTML. RETRY() re-sends the request when it
# fails; stop_for_status() raises an error if the final response is still an
# HTTP error, so an error page is never parsed as if it were a listing.
fetch_page <- function(page_url) {
  response <- RETRY("GET", page_url)
  stop_for_status(response)
  read_html(content(response, as = "text"))
}

# Zero-row template so image_data always has the expected columns, even when
# no category page could be scraped.
empty_products <- tibble(
  Category = character(),
  Subcategory = character(),
  Name = character(),
  Price = character(),
  Image_URL = character()
)

# Return one row per product found on a single category page.
scrape_category <- function(category_path) {
  # The hrefs are treated as site-relative paths, as in the menu markup.
  cat_page <- fetch_page(paste0(base_url, category_path))

  # Category label comes from the page title, without the site suffix.
  cat_name <- html_text(html_node(cat_page, "title")) %>%
    str_replace("- MoWeek", "") %>%
    str_squish() %>%
    str_to_title()

  # The menu lists every subcategory; the one being visited also carries the
  # "selected" class, so prefer it and fall back to the first entry otherwise.
  subcat_name <- html_text(
    html_node(cat_page, ".categoryLevelTwoTitle.selected")
  )
  if (is.na(subcat_name)) {
    subcat_name <- html_text(html_node(cat_page, ".categoryLevelTwoTitle"))
  }
  subcat_name <- str_to_title(str_squish(subcat_name))

  # html_node() applied to a node set returns one result per product (missing
  # where absent), so the columns stay aligned without a row-by-row loop.
  image_tags <- html_nodes(cat_page, ".productViewContainer")
  n_products <- length(image_tags)

  tibble(
    Category = rep(cat_name, n_products),
    Subcategory = rep(subcat_name, n_products),
    Name = html_text(html_node(image_tags, ".productViewName")),
    Price = html_text(html_node(image_tags, ".productViewPrice")),
    Image_URL = html_attr(
      html_node(image_tags, ".productViewTop"),
      "data-hover-image"
    )
  )
}

# Category links ----
webpage <- fetch_page(url)
category_nodes <- html_nodes(webpage, ".expandedCategory")
category_urls <- category_nodes %>%
  html_nodes("a") %>%
  html_attr("href") %>%
  str_subset("/vestimenta/|/calzado/|/accesorios/") %>%
  unique() # the same link can appear more than once in the menu

# Product data ----
# A page that still fails after the retries is reported and skipped instead of
# aborting the whole crawl.
category_pages <- lapply(category_urls, function(category_path) {
  tryCatch(
    scrape_category(category_path),
    error = function(e) {
      warning(
        "Skipping ", category_path, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
})

image_data <- bind_rows(empty_products, category_pages) %>%
  mutate(
    Name = str_trim(Name),
    Price = str_trim(Price)
  )

# Image downloads ----
# File name = last path segment of the image URL, without any query string or
# fragment (those are not valid in file names on every platform).
file_names <- image_data$Image_URL %>%
  str_remove("[?#].*$") %>%
  basename()

for (i in seq_len(nrow(image_data))) {
  img_url <- image_data$Image_URL[i]
  # Products without a hover image have no URL to download.
  if (is.na(img_url) || !nzchar(img_url)) {
    next
  }

  # <Category>/<Subcategory>, with spaces replaced by underscores.
  dir_name <- file.path(
    gsub(" ", "_", image_data$Category[i]),
    gsub(" ", "_", image_data$Subcategory[i])
  )
  dir.create(dir_name, recursive = TRUE, showWarnings = FALSE)

  # mode = "wb" keeps the image bytes intact; the default text mode corrupts
  # binary files on Windows. A failed download is reported and skipped.
  # NOTE(review): assumes data-hover-image holds absolute URLs - confirm.
  tryCatch(
    download.file(img_url, file.path(dir_name, file_names[i]), mode = "wb"),
    error = function(e) {
      warning(
        "Could not download ", img_url, ": ", conditionMessage(e),
        call. = FALSE
      )
    }
  )
}