Hope you are doing well. Sorry for bad english.
I am facing an issue while doing Web Scraping with Google Apps Script. The problem I'm encountering is that when searching for a specific part of the HTML, it extracts it correctly but doesn't join it with the rest of the information correctly. I believe the problem lies in the fact that in some cases that part exists in the HTML, and in others, it doesn't. For those that don't exist, I want it to display an "X", and for the rest, I want it to bring the information normally. Here is the code I attempted and a sample of how the data is currently coming and how I would like it to be displayed.
The code:
function webScrapingPP() {
var sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Buscador Palabras Claves");
var keyword = sheet.getRange("B3").getValue();
var baseUrl = "https://listado.mercadolibre.com.uy/" + keyword + "_OrderId_PRICE_ITEM*CONDITION_2230284_NoIndex_True";
var response = UrlFetchApp.fetch(baseUrl);
var content = response.getContentText();
var namePattern = /<h2 aria-level="3" class="ui-search-item__title">([^<]+)<\/h2>/g;
var pricePattern = /<span class="andes-visually-hidden">(\d+)\s*pesos<\/span>/g;
var linkPattern = /<a href="([^"]+)" class="ui-search-item__group__element ui-search-link"/g;
var ratingPattern = /<span class="andes-visually-hidden">Calificación ([\d.]+) de 5. (\d+) opiniones.<\/span>/g;
var names = content.match(namePattern);
var prices = content.match(pricePattern);
var links = content.match(linkPattern);
var ratings = content.match(ratingPattern);
var data = [];
if (names && prices && links) {
for (var i = 0; i < Math.min(names.length, prices.length, links.length); i++) {
var name = names[i].match(/<h2 aria-level="3" class="ui-search-item__title">([^<]+)<\/h2>/);
var price = prices[i].match(/(\d+ pesos)/);
var link = links[i].match(/href="([^"]+)"/);
// Check if there is a matching rating and review count
var rating = "X";
var reviewCount = "X";
if (ratings && ratings[i]) {
var ratingMatch = ratings[i].match(/Calificación ([\d.]+) de 5. (\d+) opiniones/);
if (ratingMatch && ratingMatch.length >= 3) {
rating = ratingMatch[1];
reviewCount = ratingMatch[2];
}
}
// Add the data to the array
data.push([name[1], price[0], link[1], rating, reviewCount]);
}
}
// Write the data to the sheet from row 6 onwards
sheet.getRange(6, 2, data.length, 5).setValues(data);
}
The result :
Desired Result:
link we are trying to Scrap: https://listado.mercadolibre.com.uy/banera-spa-tina-portatil-plegable-inflable-pvc_OrderId_PRICE_ITEM*CONDITION_2230284_NoIndex_True
Keywords Example: banera spa tina portatil plegable inflable pvc
I change the code in order to use the link of the product to do another check inside the link to get the rating and review numbers:
function webScrapingPP() {
var sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Buscador Palabras Claves");
var keyword = sheet.getRange("B3").getValue();
var baseUrl = "https://listado.mercadolibre.com.uy/" + keyword + "_OrderId_PRICE_ITEM*CONDITION_2230284_NoIndex_True";
sheet.getRange("B6:F").clearContent(); // Se agregaron dos columnas adicionales para calificación y cantidad de reseñas
var response = UrlFetchApp.fetch(baseUrl);
var content = response.getContentText();
var namePattern = /<h2 aria-level="3" class="ui-search-item__title">([^<]+)<\/h2>/g;
var pricePattern = /aria-label="(\d+) pesos"/g;
var linkPattern = /<a href="([^"]+)" class="ui-search-item__group__element ui-search-link"/g;
var names = content.match(namePattern);
var prices = content.match(pricePattern);
var links = content.match(linkPattern);
var data = [];
if (names && prices && links) {
for (var i = 0; i < Math.min(names.length, prices.length, links.length); i++) {
var name = names[i].match(/<h2 aria-level="3" class="ui-search-item__title">([^<]+)<\/h2>/);
var price = prices[i].match(/aria-label="(\d+) pesos/);
var link = links[i].match(/href="([^"]+)"/);
// Obtener la calificación y cantidad de reseñas
var productInfo = getProductInfo(link[1]);
// Add the data to the array
data.push([name[1], price[1], link[1], productInfo.rating, productInfo.reviews]);
}
}
// Write the data to the sheet from row 6 onwards
sheet.getRange(6, 2, data.length, 5).setValues(data);
}
function getProductInfo(productUrl) {
var options = {
followRedirects: false, // Desactivar las redirecciones automáticas para capturar la URL redirigida
muteHttpExceptions: true // Evitar que se arrojen excepciones por errores HTTP
};
var response = UrlFetchApp.fetch(productUrl, options);
if (response.getResponseCode() == 302) {
// Si la respuesta es una redirección (código 302), obtén la nueva ubicación
var newLocation = response.getHeaders()["Location"];
// Realiza una nueva solicitud a la nueva ubicación
response = UrlFetchApp.fetch(newLocation);
}
var content = response.getContentText();
// Patrón para buscar la calificación y cantidad de reseñas
var ratingPattern = /<span aria-hidden="true" class="ui-pdp-review__rating">([\d.]+)<\/span>/;
var reviewsPattern = /<span aria-hidden="true" class="ui-pdp-review__amount">\((\d+)\)<\/span>/;
var ratingMatch = content.match(ratingPattern);
var reviewsMatch = content.match(reviewsPattern);
var rating = ratingMatch ? ratingMatch[1] : "X";
var reviews = reviewsMatch ? reviewsMatch[1] : "X";
return {
rating: rating,
reviews: reviews
};
}