Search code examples
xslt-2.0 xslt-3.0

XSLT 2 or 3: How to convert <!-- COMMENT --> ... <!-- /COMMENT --> comment pairs into an encapsulating element in the output markup


I need to convert non-compliant HTML into XML so that I can process thousands of documents into the required JSON format with XSLT.

          <!-- IMAGECOUNT -->
      <table width="100%">
         <tr>
            <td colspan="1" rowspan="1">
               <strong>
2 Images
</strong>
            </td>
         </tr>
      </table>
      <!-- /IMAGECOUNT -->
      <p>
         <strong>
<!-- SECTION -->
The section 
<!-- /SECTION -->
<!-- COUNTRY -->
The country
<!-- /COUNTRY -->


</strong>
      </p>
      <!-- DATE -->
      <p>
         <font size="-1">
            <b>Date Posted: 09-Dec-2019</b>
         </font>
      </p>
      <!-- /DATE -->
      <!-- TEXT -->
      <center>
         <p>
            <font size="-1">
Just some text
</font>
         </p>
      </center>
      <!-- /TEXT -->
      <!-- TOP THUMBNAILS -->
      <table class="tabletopbottom" width="100%">
         <tr>
            <td colspan="1" rowspan="1">
               <img src="images/s1353556.jpg" alt="Cat"/>
               <img src="images/s1164352.jpg" alt="Dog"/>
            </td>
         </tr>
      </table>
      <!-- /TOP THUMBNAILS -->

I need to add structure to the HTML so that I can use another XSLT to remove all the elements that are not important for the final JSON.

This seems like an xsl:for-each-group problem using group-starting-with, but I can't get the logic to gobble up the following siblings until the next matching end comment is found.

Start Comment

  <!-- IMAGECOUNT -->
    Lots of content that needs to be children of this new element
  <!-- /IMAGECOUNT -->

Here is my latest, unsuccessful attempt:

<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="3.0">

  <!-- Identity transform template: matches every attribute and every node and
       copies it to the output. The comment() rule in this stylesheet declares
       priority="10", so comment nodes are handled by that rule instead. -->
  <xsl:template match="@* | node()">
  
    <!-- xsl:copy copies only the current node itself; the nested
         apply-templates then recurses into its attributes and children. -->
    <xsl:copy>
      <xsl:apply-templates select="@* | node()"/>
    </xsl:copy>
  </xsl:template>

  <!-- Match opening comments and create elements.
       Every comment node is handled here (explicit priority 10). A comment whose
       normalized text starts with '/' is treated as a closing marker and produces
       no output; any other comment is replaced by an element named after the
       comment text with space characters removed. Each branch also writes a
       diagnostic line through xsl:message. -->
  <xsl:template match="comment()" priority="10">
    <!-- Comment text with leading/trailing whitespace removed and inner runs collapsed. -->
    <xsl:variable name="commentContent" select="normalize-space(.)"/>
    <xsl:message>Processing comment: <xsl:value-of select="$commentContent"/></xsl:message>
    <xsl:choose>
      <!-- Closing marker such as "/IMAGECOUNT": only logged, nothing is emitted. -->
      <xsl:when test="starts-with($commentContent, '/')">
        <xsl:message>   Ignoring closing comment: <xsl:value-of select="$commentContent"/></xsl:message>
      </xsl:when>
      <!-- Any other comment: emit an element whose name is the comment text without spaces. -->
      <xsl:otherwise>
        <xsl:message>Creating element for comment: <xsl:value-of select="$commentContent"/></xsl:message>
        <xsl:element name="{replace($commentContent, ' ', '')}">
                      <!-- Selects only the single node that immediately follows this comment, and
                           only when that node is not itself a comment. Later siblings up to the
                           closing marker are not selected here.
                           NOTE(review): the identity template still applies templates to every
                           child of the parent, so the selected sibling is presumably also output
                           again in its original position; confirm against actual output. -->
                      <xsl:apply-templates select="following-sibling::node()[1][not(self::comment())]"/>
        </xsl:element>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:template>
</xsl:stylesheet>

Solution

  • A first prototype to approach this with for-each-group group-starting-with/group-ending-with is

    <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
      version="3.0"
      xmlns:xs="http://www.w3.org/2001/XMLSchema"
      exclude-result-prefixes="#all"
      expand-text="yes">
    
      <xsl:output method="html" indent="no" html-version="5"/>
    
      <xsl:mode on-no-match="shallow-copy"/>
      
      <!--
        Wraps marker-comment pairs in elements.

        Every element that has comment children is copied, and its children are
        split into groups that each start at an "opening" marker comment. An
        opening marker is a comment whose text, with all whitespace removed, is a
        valid XML name (for example "IMAGECOUNT" or "TOP THUMBNAILS"). Closing
        markers such as "/IMAGECOUNT" and free-text comments that cannot name an
        element never start a group, so they cannot make xsl:element fail with
        an invalid-name error; they are processed like any other node.

        Inside a group, the nodes between the opening marker and its matching
        closing marker (same text preceded by "/", compared with whitespace
        ignored) are wrapped in an element named after the marker; both marker
        comments are dropped. The comparison is a plain string comparison, so
        marker text is never interpreted as a regular expression.

        An opening marker that has no matching closing marker in its group is
        kept in the output and its following siblings are processed normally.

        Limitation: a marker pair nested inside another pair at the same sibling
        level is not supported, because the inner opening marker ends the outer
        group before the outer closing marker is reached.
      -->
      <xsl:template match="*[comment()]">
        <xsl:copy>
          <xsl:apply-templates select="@*"/>
          <xsl:for-each-group select="node()"
                              group-starting-with="comment()[replace(., '\s+', '') castable as xs:NCName]">
            <!-- Empty unless the group is led by a comment; the first group may
                 start with ordinary content or with a non-marker comment. -->
            <xsl:variable name="element-name" select="self::comment()/replace(., '\s+', '')"/>
            <xsl:choose>
              <xsl:when test="$element-name castable as xs:NCName">
                <!-- Closing markers for this opener within the same group. -->
                <xsl:variable name="closers"
                              select="tail(current-group())[self::comment()][replace(., '\s+', '') = '/' || $element-name]"/>
                <!-- Unmatched opener: keep the comment instead of silently dropping it. -->
                <xsl:if test="empty($closers)">
                  <xsl:apply-templates select="."/>
                </xsl:if>
                <xsl:for-each-group select="tail(current-group())"
                                    group-ending-with="comment()[. intersect $closers]">
                  <xsl:choose>
                    <!-- Wrap only when the group really ends at a matching closing marker,
                         not merely at some other trailing comment. -->
                    <xsl:when test="current-group()[last()] intersect $closers">
                      <xsl:element name="{$element-name}">
                        <xsl:apply-templates select="current-group()[position() lt last()]"/>
                      </xsl:element>
                    </xsl:when>
                    <xsl:otherwise>
                      <xsl:apply-templates select="current-group()"/>
                    </xsl:otherwise>
                  </xsl:choose>
                </xsl:for-each-group>
              </xsl:when>
              <xsl:otherwise>
                <xsl:apply-templates select="current-group()"/>
              </xsl:otherwise>
            </xsl:choose>
          </xsl:for-each-group>
        </xsl:copy>
      </xsl:template>
    
    
    </xsl:stylesheet>
    

    Online fiddle with Saxon HE Java in the browser.