I need to convert non-compliant HTML into XML so that I can use XSLT to transform thousands of documents into the required JSON format.
<!-- IMAGECOUNT -->
<table width="100%">
<tr>
<td colspan="1" rowspan="1">
<strong>
2 Images
</strong>
</td>
</tr>
</table>
<!-- /IMAGECOUNT -->
<p>
<strong>
<!-- SECTION -->
The section
<!-- /SECTION -->
<!-- COUNTRY -->
The country
<!-- /COUNTRY -->
</strong>
</p>
<!-- DATE -->
<p>
<font size="-1">
<b>Date Posted: 09-Dec-2019</b>
</font>
</p>
<!-- /DATE -->
<!-- TEXT -->
<center>
<p>
<font size="-1">
Just some text
</font>
</p>
</center>
<!-- /TEXT -->
<!-- TOP THUMBNAILS -->
<table class="tabletopbottom" width="100%">
<tr>
<td colspan="1" rowspan="1">
<img src="images/s1353556.jpg" alt="Cat"/>
<img src="images/s1164352.jpg" alt="Dog"/>
</td>
</tr>
</table>
<!-- /TOP THUMBNAILS -->
I need to add structure to the HTML so that I can use another XSLT to remove all the elements that are not important for the final JSON.
This seems like an xsl:for-each-group problem using group-starting-with, but I can't get the logic to gobble up the following siblings until the next matching end comment is found.
Start Comment
<!-- IMAGECOUNT -->
Lots of content that needs to be children of this new element
<!-- /IMAGECOUNT -->
Here is my latest, unsuccessful attempt:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Attempt at turning marker comments into wrapper elements.
  An opening marker comment (text not starting with '/') becomes an element
  named after the comment text; a closing marker comment (text starting with
  '/') produces no output. Everything else is copied by the identity template.
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="3.0">
<!-- Identity transform: copies every attribute and node, recursing into children,
     unless a higher-priority template (the comment template below) matches. -->
<xsl:template match="@* | node()">
<xsl:copy>
<xsl:apply-templates select="@* | node()"/>
</xsl:copy>
</xsl:template>
<!-- Comment handler. priority="10" makes it win over the identity template,
     whose node() pattern also matches comments. -->
<xsl:template match="comment()" priority="10">
<!-- Comment text with leading/trailing whitespace removed and inner runs collapsed. -->
<xsl:variable name="commentContent" select="normalize-space(.)"/>
<!-- Diagnostic output only; xsl:message does not write to the result tree. -->
<xsl:message>Processing comment: <xsl:value-of select="$commentContent"/></xsl:message>
<xsl:choose>
<!-- Closing marker: nothing but the message is produced, so the comment is
     dropped from the result. -->
<xsl:when test="starts-with($commentContent, '/')">
<xsl:message> Ignoring closing comment: <xsl:value-of select="$commentContent"/></xsl:message>
</xsl:when>
<xsl:otherwise>
<xsl:message>Creating element for comment: <xsl:value-of select="$commentContent"/></xsl:message>
<!-- Element name is the comment text with every space removed
     (e.g. "TOP THUMBNAILS" gives "TOPTHUMBNAILS"); the result has to be a
     valid element name for xsl:element to succeed. -->
<xsl:element name="{replace($commentContent, ' ', '')}">
<!-- Only the single node immediately after the comment is selected, and only
     when that node is not itself a comment. Nothing here walks forward to the
     closing marker, so the element does not receive the whole section.
     NOTE(review): in the sample input that first sibling is the whitespace
     text node after the comment (unless whitespace is stripped), and the
     following siblings are still processed by the parent's apply-templates,
     so they end up after the new element rather than inside it. -->
<xsl:apply-templates select="following-sibling::node()[1][not(self::comment())]"/>
</xsl:element>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
</xsl:stylesheet>
A first prototype that approaches this with xsl:for-each-group, using group-starting-with and group-ending-with, is:
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
version="3.0"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
exclude-result-prefixes="#all"
expand-text="yes">
<!--
  Wraps the nodes between a pair of marker comments in a new element.

  A start marker is a comment whose text does not begin with '/'; its end
  marker is a later sibling comment whose text is '/' followed by the same
  text (compared after whitespace normalisation). The wrapper element is named
  after the start marker text with all whitespace removed, e.g. the markers
  "TOP THUMBNAILS" and "/TOP THUMBNAILS" yield one TOPTHUMBNAILS element.

  Anything that cannot be paired safely is copied through unchanged instead of
  being dropped or raising an error:
    * a start comment with no matching end comment before the next start comment,
    * a comment whose text would not be a valid element name,
    * stray end comments that belong to no start comment.
-->
<xsl:output method="html" indent="no" html-version="5"/>
<!-- Every node without an explicit template is copied as is. -->
<xsl:mode on-no-match="shallow-copy"/>
<!-- Any element with comment children: regroup its children around the markers. -->
<xsl:template match="*[comment()]">
<xsl:copy>
<xsl:apply-templates select="@*"/>
<!-- Outer grouping: a new group begins at every start marker. The pattern is
     anchored so that only a leading '/' marks an end comment. -->
<xsl:for-each-group select="node()" group-starting-with="comment()[not(matches(., '^\s*/'))]">
<!-- The context item is the first node of the group; it is the start marker
     for every group except possibly the first one. -->
<xsl:variable name="comment-name" select="normalize-space(.)"/>
<xsl:variable name="end-marker-text" select="'/' || $comment-name"/>
<xsl:variable name="element-name" select="replace($comment-name, '\s+', '')"/>
<!-- Plain string comparison: the comment text is never used as a regular
     expression, so metacharacters in it are harmless and "/A" cannot be
     mistaken for "/AB". -->
<xsl:variable name="has-end-marker" select="exists(tail(current-group())[self::comment()][normalize-space(.) = $end-marker-text])"/>
<xsl:choose>
<xsl:when test="self::comment() and $has-end-marker and $element-name castable as xs:NCName">
<!-- Inner grouping: split the rest of the group after each matching end marker. -->
<xsl:for-each-group select="tail(current-group())" group-ending-with="comment()[normalize-space(.) = $end-marker-text]">
<xsl:choose>
<!-- Wrap only when the group really ends with the matching end marker; a
     trailing group that merely ends with some other comment is copied. -->
<xsl:when test="current-group()[last()][self::comment()][normalize-space(.) = $end-marker-text]">
<xsl:element name="{$element-name}">
<!-- The end marker itself is consumed; everything before it becomes content. -->
<xsl:apply-templates select="current-group()[not(position() = last())]"/>
</xsl:element>
</xsl:when>
<xsl:otherwise>
<xsl:apply-templates select="current-group()"/>
</xsl:otherwise>
</xsl:choose>
</xsl:for-each-group>
</xsl:when>
<xsl:otherwise>
<!-- Leading nodes before the first start marker, an unpaired start comment,
     or a comment that cannot name an element: keep everything, including the
     comment itself. -->
<xsl:apply-templates select="current-group()"/>
</xsl:otherwise>
</xsl:choose>
</xsl:for-each-group>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>