Search code examples
htmlgoogle-apps-scriptgoogle-sheetsweb-scrapinghtml-table

Google Apps Script/ Google Sheets Problem with WebScrapping in Tables


Hope you are doing well. Sorry for bad english.

I am facing an issue while doing Web Scraping with Google Apps Script. The problem I'm encountering is that when searching for a specific part of the HTML, it extracts it correctly but doesn't join it with the rest of the information correctly. I believe the problem lies in the fact that in some cases that part exists in the HTML, and in others, it doesn't. For those that don't exist, I want it to display an "X", and for the rest, I want it to bring the information normally. Here is the code I attempted and a sample of how the data is currently coming and how I would like it to be displayed.

The code:

function webScrapingPP() {
  var sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Buscador Palabras Claves");
  var keyword = sheet.getRange("B3").getValue();
  var baseUrl = "https://listado.mercadolibre.com.uy/" + keyword + "_OrderId_PRICE_ITEM*CONDITION_2230284_NoIndex_True";

  var response = UrlFetchApp.fetch(baseUrl);
  var content = response.getContentText();

  var namePattern = /<h2 aria-level="3" class="ui-search-item__title">([^<]+)<\/h2>/g;
  var pricePattern = /<span class="andes-visually-hidden">(\d+)\s*pesos<\/span>/g;
  var linkPattern = /<a href="([^"]+)" class="ui-search-item__group__element ui-search-link"/g;
  var ratingPattern = /<span class="andes-visually-hidden">Calificación ([\d.]+) de 5. (\d+) opiniones.<\/span>/g;

  var names = content.match(namePattern);
  var prices = content.match(pricePattern);
  var links = content.match(linkPattern);
  var ratings = content.match(ratingPattern);

  var data = [];

  if (names && prices && links) {
    for (var i = 0; i < Math.min(names.length, prices.length, links.length); i++) {
      var name = names[i].match(/<h2 aria-level="3" class="ui-search-item__title">([^<]+)<\/h2>/);
      var price = prices[i].match(/(\d+ pesos)/);
      var link = links[i].match(/href="([^"]+)"/);

      // Check if there is a matching rating and review count
      var rating = "X";
      var reviewCount = "X";
      if (ratings && ratings[i]) {
        var ratingMatch = ratings[i].match(/Calificación ([\d.]+) de 5. (\d+) opiniones/);
        if (ratingMatch && ratingMatch.length >= 3) {
          rating = ratingMatch[1];
          reviewCount = ratingMatch[2];
        }
      }

      // Add the data to the array
      data.push([name[1], price[0], link[1], rating, reviewCount]);
    }
  }

  // Write the data to the sheet from row 6 onwards
  sheet.getRange(6, 2, data.length, 5).setValues(data);
}

The result :

enter image description here

Desired Result:

enter image description here

link we are trying to Scrap: https://listado.mercadolibre.com.uy/banera-spa-tina-portatil-plegable-inflable-pvc_OrderId_PRICE_ITEM*CONDITION_2230284_NoIndex_True

Keywords Example: banera spa tina portatil plegable inflable pvc


Solution

  • I change the code in order to use the link of the product to do another check inside the link to get the rating and review numbers:

    function webScrapingPP() {
      var sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Buscador Palabras Claves");
      var keyword = sheet.getRange("B3").getValue();
      var baseUrl = "https://listado.mercadolibre.com.uy/" + keyword + "_OrderId_PRICE_ITEM*CONDITION_2230284_NoIndex_True";
    
      sheet.getRange("B6:F").clearContent(); // Se agregaron dos columnas adicionales para calificación y cantidad de reseñas
    
      var response = UrlFetchApp.fetch(baseUrl);
      var content = response.getContentText();
    
      var namePattern = /<h2 aria-level="3" class="ui-search-item__title">([^<]+)<\/h2>/g;
      var pricePattern = /aria-label="(\d+) pesos"/g;
      var linkPattern = /<a href="([^"]+)" class="ui-search-item__group__element ui-search-link"/g;
    
      var names = content.match(namePattern);
      var prices = content.match(pricePattern);
      var links = content.match(linkPattern);
    
      var data = [];
    
      if (names && prices && links) {
        for (var i = 0; i < Math.min(names.length, prices.length, links.length); i++) {
          var name = names[i].match(/<h2 aria-level="3" class="ui-search-item__title">([^<]+)<\/h2>/);
          var price = prices[i].match(/aria-label="(\d+) pesos/);
          var link = links[i].match(/href="([^"]+)"/);
    
          // Obtener la calificación y cantidad de reseñas
          var productInfo = getProductInfo(link[1]);
    
          // Add the data to the array
          data.push([name[1], price[1], link[1], productInfo.rating, productInfo.reviews]);
        }
      }
    
      // Write the data to the sheet from row 6 onwards
      sheet.getRange(6, 2, data.length, 5).setValues(data);
    }
    
    function getProductInfo(productUrl) {
      var options = {
        followRedirects: false,  // Desactivar las redirecciones automáticas para capturar la URL redirigida
        muteHttpExceptions: true  // Evitar que se arrojen excepciones por errores HTTP
      };
      var response = UrlFetchApp.fetch(productUrl, options);
      
      if (response.getResponseCode() == 302) {
        // Si la respuesta es una redirección (código 302), obtén la nueva ubicación
        var newLocation = response.getHeaders()["Location"];
        
        // Realiza una nueva solicitud a la nueva ubicación
        response = UrlFetchApp.fetch(newLocation);
      }
      var content = response.getContentText();
    
      // Patrón para buscar la calificación y cantidad de reseñas
      var ratingPattern = /<span aria-hidden="true" class="ui-pdp-review__rating">([\d.]+)<\/span>/;
      var reviewsPattern = /<span aria-hidden="true" class="ui-pdp-review__amount">\((\d+)\)<\/span>/;
    
      var ratingMatch = content.match(ratingPattern);
      var reviewsMatch = content.match(reviewsPattern);
    
      var rating = ratingMatch ? ratingMatch[1] : "X";
      var reviews = reviewsMatch ? reviewsMatch[1] : "X";
    
      return {
        rating: rating,
        reviews: reviews
      };
    }