Search code examples
r web-scraping rvest httr

Downloading images from web and its attributes in R


I want to download the images from this site, https://moweek.com.uy/, in R; specifically, those under class = "expandedCategory": "VESTIMENTA", "CALZADO", and "ACCESORIOS" (data-category-id="1", data-category-id="2", and data-category-id="3"). For example, I want to go into each subcategory under "VESTIMENTA" (except "Ver todo"), such as "Activewear" (<a data-category-id="57" href="/vestimenta/activewear/1" class=" categoryLevelTwoTitle selected">Activewear</a>), click on every image, and collect all of the information in a dataset, along with the images themselves.

I've been trying to do this, but I am failing.

# Attempt to scrape product data and images from moweek.com.uy.
# Steps: (1) collect category links from the home page, (2) visit each
# category page and record product name, price and image URL, (3) fetch each
# image and prepare a Category/Subcategory folder for it.

# Load tidyverse, rvest and httr through pacman
pacman::p_load(tidyverse, rvest, httr)

# Fetch the home page and parse its HTML
url <- "https://moweek.com.uy/"
html_content <- GET(url)
webpage <- read_html(content(html_content, as = "text"))

# Nodes carrying the "expandedCategory" class
category_nodes <- html_nodes(webpage, ".expandedCategory")

# Collect the href of every <a> inside those nodes and keep only the links
# that contain /vestimenta/, /calzado/ or /accesorios/
category_urls <- lapply(category_nodes, function(node) html_nodes(node, "a") %>% 
                          html_attr("href")) %>% 
  unlist() %>% 
  str_subset("/vestimenta/|/calzado/|/accesorios/")
                 
# Accumulator for the products found on all category pages
image_data <- data.frame()
# NOTE: the loop variable reuses (and overwrites) the `url` defined above
for (url in category_urls) {
  # hrefs are treated as site-relative, so prefix the host
  cat_url <- paste0("https://moweek.com.uy", url)
  cat_content <- GET(cat_url)
  cat_page <- read_html(content(cat_content, as = "text"))
  
  # Extract the category name and the subcategory name from the page title
  cat_name <- html_text(html_node(cat_page, "title")) %>%
    str_replace("- MoWeek", "") %>%
    str_to_title()
  # html_node() returns only the first ".categoryLevelTwoTitle" match
  subcat_name <- html_text(html_node(cat_page, ".categoryLevelTwoTitle")) %>%
    str_to_title()
  
  # Extract the image information from the page and store it in a data frame
  image_tags <- html_nodes(cat_page, ".productItem")
  # NOTE(review): image_info starts with no columns; tibble::add_row() only
  # accepts values for columns that already exist, so the add_row() call below
  # likely errors as soon as a product node is found - confirm
  image_info <- data.frame()
  for (tag in image_tags) {
    name <- html_text(html_node(tag, ".productTitle"))
    price <- html_text(html_node(tag, ".priceText"))
    img_url <- html_attr(html_node(tag, "img"), "src")
    image_info <- image_info %>%
      add_row(Category = cat_name,
              Subcategory = subcat_name,
              Name = name,
              Price = price,
              Image_URL = img_url)
  }
  image_data <- image_data %>% bind_rows(image_info)
}

# Fetch every image and make sure its Category/Subcategory folder exists.
# NOTE: 1:nrow(image_data) yields c(1, 0) when image_data has zero rows.
for (i in 1:nrow(image_data)) {
  img <- GET(image_data$Image_URL[i])
  category <- image_data$Category[i]
  subcategory <- image_data$Subcategory[i]
  # Folder names: spaces replaced by underscores
  category_folder <- gsub(" ", "_", category)
  subcategory_folder <- gsub(" ", "_", subcategory)
  
  if (!dir.exists(category_folder)) {
    dir.create(category_folder)
  }
  if (!dir.exists(paste0(category_folder, "/", subcategory_folder))) {
    dir.create(paste0(category_folder, "/", subcategory_folder))
  }
  
  # NOTE: this statement is incomplete - it assigns the function `paste`
  # itself, and the response stored in `img` is never written to disk
  file_name <- paste
}

Solution

  • This code seemed to work for me.

    library(tidyverse)
    library(rvest)
    library(httr)

    # Scrape product names, prices and image URLs from the clothing, footwear
    # and accessories category pages of moweek.com.uy, then download every
    # image into a <Category>/<Subcategory>/ folder under the working directory.

    url <- "https://moweek.com.uy/"
    html_content <- GET(url)
    webpage <- read_html(content(html_content, as = "text"))

    category_nodes <- html_nodes(webpage, ".expandedCategory")

    # html_nodes() and html_attr() are vectorised over a node set, so the links
    # can be collected in one pass; unique() avoids scraping a page twice.
    category_urls <- category_nodes %>%
      html_nodes("a") %>%
      html_attr("href") %>%
      str_subset("/vestimenta/|/calzado/|/accesorios/") %>%
      unique()

    # Scrape one category page.
    #
    # path: site-relative href of the page, e.g. "/vestimenta/activewear/1".
    # Returns a tibble with one row per product (columns Category, Subcategory,
    # Name, Price, Image_URL), or NULL when the page could not be fetched.
    scrape_category <- function(path) {
      cat_url <- paste0("https://moweek.com.uy", path)
      # Because we are sending so many requests, some are likely to fail
      cat_content <- RETRY("GET", cat_url)
      if (http_error(cat_content)) {
        warning(
          "Skipping ", cat_url, ": HTTP ", status_code(cat_content),
          call. = FALSE
        )
        return(NULL)
      }
      cat_page <- read_html(content(cat_content, as = "text"))

      # Extract the category name from the page title. Trim after removing the
      # site suffix so folder names do not end in a stray underscore.
      cat_name <- html_text(html_node(cat_page, "title")) %>%
        str_replace("- MoWeek", "") %>%
        str_trim() %>%
        str_to_title()
      # NOTE(review): html_node() returns the first ".categoryLevelTwoTitle"
      # match, which may not be the currently selected subcategory - confirm
      # against the page markup (the selected link also has class "selected").
      subcat_name <- html_text(html_node(cat_page, ".categoryLevelTwoTitle")) %>%
        str_trim() %>%
        str_to_title()

      # html_node() applied to a node set returns one result per product (NA
      # when the child is missing), so all columns are built without a loop.
      image_tags <- html_nodes(cat_page, ".productViewContainer")
      tibble(
        Category = cat_name,
        Subcategory = subcat_name,
        Name = image_tags %>%
          html_node(".productViewName") %>%
          html_text() %>%
          str_trim(),
        Price = image_tags %>%
          html_node(".productViewPrice") %>%
          html_text() %>%
          str_trim(),
        Image_URL = image_tags %>%
          html_node(".productViewTop") %>%
          html_attr("data-hover-image")
      )
    }

    # Typed empty tibble so image_data always has the expected columns, even
    # when no category link was found or every page failed.
    empty_products <- tibble(
      Category = character(),
      Subcategory = character(),
      Name = character(),
      Price = character(),
      Image_URL = character()
    )
    # Build one tibble per page and bind once instead of growing row by row;
    # bind_rows() silently drops the NULLs returned for failed pages.
    image_data <- bind_rows(empty_products, lapply(category_urls, scrape_category))

    # Get the file name of each image: last path segment of the URL, without
    # any query string or fragment (NA stays NA)
    file_names <- basename(str_remove(image_data$Image_URL, "[?#].*$"))

    for (i in seq_len(nrow(image_data))) {
      url <- image_data$Image_URL[i]
      # Products without a hover image have no URL to download
      if (is.na(url)) {
        next
      }

      # Folder names: spaces replaced by underscores
      dir_name <- file.path(
        gsub(" ", "_", image_data$Category[i]),
        gsub(" ", "_", image_data$Subcategory[i])
      )
      # recursive = TRUE creates both levels; no warning if they already exist
      dir.create(dir_name, recursive = TRUE, showWarnings = FALSE)

      # Download and store image in correct directory. mode = "wb" is required
      # for binary files, otherwise images are corrupted on Windows. A single
      # failed download is reported and skipped rather than aborting the run.
      tryCatch(
        download.file(url, file.path(dir_name, file_names[i]), mode = "wb"),
        error = function(e) {
          warning(
            "Failed to download ", url, ": ", conditionMessage(e),
            call. = FALSE
          )
        }
      )
    }