Search code examples
javascripthtmlgoogle-apps-scriptweb-scrapingcheerio

XML parse : table data to Google Spreadsheet via App Script


I need to parse table data to my spreadsheet, there is no error in log, but cells in spreadsheet are blank. The problem is that I can't use built-in importhtml function, becouse the date-related data in the tag is enered dynamically on the website. I tryied to getchild getchildren but it doesn't work.

The structure of html site looks like this:

   <html>
      <head>
        <title>TITLE AAAAA</title>
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
        <meta content="IE=EmulateIE7" http-equiv="X-UA-Compatible">
        <style type="text/css">
          body {
            font-size: 12px;
            font-family: Arial
          }
          td {
            font-size: 12px;
            line-height: 20px;
            font-family: Arial
          }
        </style>
        <script type="text/javascript" language="javascript" src="Function.js"></script>
      </head>
      <body>
        <p align="center">
          <b>AAAA: &nbsp; AAAAAA</b>
        </p>
        <table width="300" border="0" align="center" cellpadding="1" cellspacing="1" bgcolor="#0066cc">
          <tbody>
            <tr align="center" bgcolor="#333399" class="font13">
              <td width="150">
                <b>
                  <font color="#ffffff">TO_CELL_A1_TEXT</font>
                </b>
              </td>
              <td width="150">
                <b>
                  <font color="#ffffff">TO_CELL_B1_TEXT</font>
                </b>
              </td>
              <td width="150">
                <b>
                  <font color="#ffffff">TO_CELL_C1_TEXT</font>
                </b>
              </td>
              <td width="150">
                <b>
                  <font color="#ffffff">TO_CELL_D1_TIME_TEXT</font>
                </b>
              </td>
            </tr>
            <tr align="center" bgcolor="#FFFFFF">
              <td height="20">
                <b>
                  <font color="red">TO_CELL_A2_TEXT</font>
                </b>
              </td>
              <td>
                <b>
                  <font color="red">TO_CELL_B2_TEXT</font>
                </b>
              </td>
              <td>
                <b>
                  <font color="red">TO_CELL_C2_TEXT</font>
                </b>
              </td>
              <td>
                <script>
                  showtime(2023, 01 - 1, 13, 23, 01, 12)
                </script>"TO_CELL_D2_TIME_TEXT"
              </td>
            </tr>
          </tbody>
        </table>
        <br>
        <p align="center">SITE_NAME</p>
      </body>

    </html>

Is there any solution to this problem?


Solution

  • Using the cheerio library, I solved the problem, also added the code that in the D2:D cells it replaces the values, for example: showtime(2023,01-1,20,21,09,48)" into the correct date and time format. I recommend reading the materials linked here by users @rubén and @doubleunary. If you run into a problem like me and you are a beginner, loading the Cheerio library via clasp into the cloned script on your computer will not work, because for const cheerio = require('cheerio');, require is a function available in the CJS module system that handles imports, basically. GitHub users came up against this problem and you should look for a solution to implement Cheerio in GAS, bearing in mind that these are not libraries recommended by Google, but created by users.

    Here is working script in my case:

    function importData() {
      var url = 'http://urlsite.com';
      var res = UrlFetchApp.fetch(url, {
        muteHttpExceptions: true
      }).getContentText();
    
      res = res.replace(/<script[^>]*>([\s\S]*?)<\/script>/gi, function(match, capture) {
        return capture;
      });
      const $ = Cheerio.load(res);
    
      var col1 = $('table tr td:nth-child(1)').toArray().map(x => $(x).text());
      var col2 = $('table tr td:nth-child(2)').toArray().map(x => $(x).text());
      var col3 = $('table tr td:nth-child(3)').toArray().map(x => $(x).text());
      var dateValues = $('table tr td:nth-child(4)').toArray().map(x => $(x).text());
      var table = dateValues.map((d, i) => [col1[i], col2[i], col3[i], d]);
      var range = SpreadsheetApp.getActiveSheet().getRange(1, 1, table.length, table[0].length);
      range.setValues(table);
    
      var sheet = SpreadsheetApp.getActiveSheet();
      var range = sheet.getRange("D2:D");
      var values = range.getValues();
      var convertedDates = values.map(function(value) {
        var match = value[0].match(/showtime\((\d+),(\d+)-1,(\d+),(\d+),(\d+),(\d+)\)/);
        if (match) {
          var year = match[1];
          var month = match[2];
          var day = match[3];
          var hour = match[4];
          var minute = match[5];
          var second = match[6];
          var date = new Date(year, month - 1, day, hour, minute, second);
          return Utilities.formatDate(date, "GMT", "yyyy-MM-dd HH:mm");
        }
        return "";
      });
      range.setValues(convertedDates.map(function(x) {
        return [x];
      }));
    }
    

    Moreover, while studying, I wrote a script to check ability to fetch data from website, what data is fetched and how does the tree of html looks like. You can use it in doc.new script.

    function addMenuTab() {
      var document = DocumentApp.getActiveDocument();
      var menu = DocumentApp.getUi().createMenu("MY PROGRAMS")
        .addItem("1_CLEAN DOC", "menuItem1Function")
        .addItem("2_GENERATE", "menuItem2Function")
        .addItem("3_BOLD RESPONSES", "menuItem3Function")
        .addToUi();
    }
    
    function menuItem1Function() {
      DocumentApp.getActiveDocument().getBody().clear();
    }
    
    function menuItem2Function() {
      var ui = DocumentApp.getUi();
      var result = ui.prompt(
        'Check ability to parse website!',
        'Please enter your url:',
        ui.ButtonSet.OK_CANCEL);
      var button = result.getSelectedButton();
      var url = result.getResponseText();
      if (button == ui.Button.OK) {
        ui.alert('Your URL is ' + url + '.');
      } else if (button == ui.Button.CANCEL) {
        ui.alert('I didn\'t get your URL.');
      } else if (button == ui.Button.CLOSE) {
        ui.alert('You closed the dialog box.');
      }
    
      var body = DocumentApp.getActiveDocument().getBody();
      body.clear();
      var response = UrlFetchApp.fetch(url);
      var responseOpts1 = UrlFetchApp.fetch(url, opts1);
      /**
      var responseOpts2 = UrlFetchApp.fetch(url, opts2);
      */
      var opts1 = {
        // methods: get, delete, patch, post, put - by default: get
        'method': 'get',
    
        // contentType: 'application/json' ;  'application/xml' ; 'application/html' - by default: application/x-www-form-urlencoded&#39
        'contentType': 'text/javascript',
    
        // key/value map of the HTTP headers for the request;
        'headers': '1',
      }
    
      /**
      var Opts2 = {
      //methods: get, delete, patch, post, put - by default: get
      'method' : 'get',
      'payload' : 'formdata'
      }
      */
    
      var responseCode = response.getResponseCode();
      Logger.log("Response Code: " + responseCode);
    
      var headers = response.getHeaders();
      Logger.log("Headers: " + headers);
    
      var headersText = JSON.stringify(response.getHeaders());
      Logger.log("HeadersText: " + headersText);
    
      var contentLength = response.getContentText().length;
      Logger.log("Content Length: " + contentLength);
    
      var allHeaders = response.getAllHeaders();
      Logger.log("All Headers: " + allHeaders);
    
      var allHeadersText = JSON.stringify(response.getAllHeaders());
      Logger.log("All HeadersText: " + allHeadersText);
    
      var document = DocumentApp.getActiveDocument();
      var body = document.getBody();
    
      var contentText = response.getContentText();
      Logger.log("Content Text: " + contentText);
    
      var contentText2 = responseOpts1.getContentText();
      Logger.log("Content Text & Opts1: " + contentText2);
    
      /**
      var contentText3 = responseOpts1.getContentText();
      // Replace html elements
      contentText3 = contentText3.replace(/(<(?=link|meta)[^>]*)(?<!\/)>/ig, '$1/>');
      contentText3 = contentText3.replace(/&(?!amp;)/ig, '&amp;');
      contentText3 = contentText3.replace(/ /ig, " ");
      contentText3 = contentText3.replace(/<table[^>]*>/ig, "<table>");
      contentText3 = contentText3.replace(/<tr[^>]*>/ig, "<tr>");
      contentText3 = contentText3.replace(/width[^>]*>/ig, "<width>");
      contentText3 = contentText3.replace(/<td[^>]*>/ig, "<td>");
      contentText3 = contentText3.replace(/<font[^>]*>/ig, "<font>");
      contentText3 = contentText3.replace(/<p[^>]*>/ig, "<p>");
      contentText3 = contentText3.replace(/width[^>]*>/ig, "<width>");
      contentText3 = contentText3.replace(/<br>|<\/br>/ig, "");
      contentText3 = contentText3.replace(/<style[^>]*>/ig, "<style>");
      contentText3 = contentText3.replace(/(<(p|script|style)[^>]*>)/ig, '$1<![CDATA[').replace(/(<\/(p|script|style)[^>]*>)/ig, ']]>$1')
      Logger.log("Content Text & Opts: " + contentText3);
    
      var contentText4 = XmlService.parse("<!DOCTYPE html>" + contentText3);
      var root = contentText4.getRootElement().getChild("body");
      var table = root.getChild("table");
      var tbody = table.getChild("tbody");
      var tr = table.getChild("tr");
      var td = tr.getChild("td"); 
      var font = td.getChildren("font"); 
      var script = td.getChildren("script"); 
      */
    
    
      body.appendParagraph("Response Code: " + responseCode);
      body.appendParagraph("");
      body.appendParagraph("Headers: " + headers);
      body.appendParagraph("");
      body.appendParagraph("HeadersText: " + headersText);
      body.appendParagraph("");
      body.appendParagraph("Content Length: " + contentLength);
      body.appendParagraph("");
      body.appendParagraph("All Headers: " + allHeaders);
      body.appendParagraph("");
      body.appendParagraph("All HeadersText: " + allHeadersText);
      body.appendParagraph("");
      body.appendParagraph("Content Text: " + contentText);
      body.appendParagraph("");
      /** 
      body.appendParagraph("Content Text & Opts1: " + contentText2);
      body.appendParagraph("");
      body.appendParagraph("Content Text3: " + contentText3);
      body.appendParagraph("");
      body.appendParagraph("Content Text4: " + contentText4);
      body.appendParagraph("");
      body.appendParagraph("Root: " + root);
      body.appendParagraph("");
      body.appendParagraph("Table: " + table);
      body.appendParagraph("");
      body.appendParagraph("Tbody: " + tbody);
      body.appendParagraph("");
      body.appendParagraph("Tr: " + tr);
      body.appendParagraph("");
      body.appendParagraph("Td: " + td);
      body.appendParagraph("");
      body.appendParagraph("font: " + font);
      body.appendParagraph("");
      body.appendParagraph("script: " + script);
      body.appendParagraph("");
      */
    }
    
    function menuItem3Function() {
      var doc = DocumentApp.getActiveDocument();
      var text = doc.getBody().getText();
      var searchString1 = "Response Code: ";
      var searchIndex = text.indexOf(searchString1);
      var responseCode = text.substring(searchIndex + searchString1.length, searchIndex + searchString1.length + 3);
      var textSearched = doc.getBody().findText(searchString1 + responseCode);
      var element1 = textSearched.getElement();
      if (responseCode === "200") {
        element1.setBold(true);
        element1.setForegroundColor("#9cd161");
      } else {
        element1.setBold(true);
        element1.setForegroundColor("#ff0000");
      }
    
      var body = doc.getBody();
      var searchString = ["Headers", "HeadersText", "Content Length", "All Headers", "All HeadersText", "Content Text", "Content Text & Opts:", "Content Text3:"];
      for (var i = 0; i < searchString.length; i++) {
        var searchResult = body.findText(searchString[i]);
        while (searchResult !== null) {
          var searchResultElement = searchResult.getElement();
          var start = searchResult.getStartOffset();
          var end = searchResult.getEndOffsetInclusive();
          searchResultElement.setBold(start, end, true);
          searchResult = body.findText(searchString[i], searchResult);
        }
      }
    }