Search code examples
xml, xslt, xml-parsing, xslt-1.0, xslt-grouping

How do we identify a set of nodes and add that set into another in XSLT when the structure of the XML source varies?


I'm trying to convert a Flash-based text format into HTML-based text.

There are <LI></LI> blocks in the source XML; I need to wrap each run of adjacent <LI> blocks inside a <ul> block.

<p></p>
<li></li> ------
<li></li>      | - should be wrapped with <ul> tag
<li></li> ------
<p></p>
<li></li>
<li></li>
<li></li>
<p></p>

XML Source

<root>
    <TEXTFORMAT LEADING="2">
        <P ALIGN="LEFT">
            edfg
        </P>
    </TEXTFORMAT>
    <TEXTFORMAT LEADING="2">
        <P ALIGN="LEFT">
            sdgfdsgsds
        </P>
    </TEXTFORMAT>
    <TEXTFORMAT LEADING="2">
        <LI>
            sdfgdsg
        </LI>
    </TEXTFORMAT>
    <TEXTFORMAT LEADING="2">
        <LI>
            dsgdfgdsfg
        </LI>
    </TEXTFORMAT>
    <TEXTFORMAT LEADING="2">
        <LI>
            <FONT FACE="Lato" SIZE="12" COLOR="#4B4B4B" LETTERSPACING="0" KERNING="0">errytrtyr</FONT>
        </LI>
    </TEXTFORMAT>
    <TEXTFORMAT LEADING="2">
        <P ALIGN="LEFT">
            sdgfdsgsds
        </P>
    </TEXTFORMAT>
    <TEXTFORMAT LEADING="2">
        <LI>
            <FONT FACE="System" SIZE="16" COLOR="#4B4B4B" LETTERSPACING="0" KERNING="0">nm,hjku
                <FONT FACE="Lato" SIZE="12"></FONT>
            </FONT>
        </LI>
    </TEXTFORMAT>
    <TEXTFORMAT LEADING="2">
        <LI>
            <FONT FACE="System" SIZE="16" COLOR="#4B4B4B" LETTERSPACING="0" KERNING="0">
                <B>hgjgj</B>
                <FONT FACE="Lato" SIZE="12"></FONT>
            </FONT>
        </LI>
    </TEXTFORMAT>
    <TEXTFORMAT LEADING="2">
        <P ALIGN="CENTER">
            <FONT FACE="Lato" SIZE="12" COLOR="#4B4B4B" LETTERSPACING="0" KERNING="0">centered text</FONT>
        </P>
    </TEXTFORMAT>
</root>

Expected Output

<div>
    <div style="text-align:LEFT; ">
        edfg
    </div>
    <div style="text-align:LEFT; ">
        sdgfdsgsds
    </div>
    <ul>
        <li>
            sdfgdsg
        </li>
        <li>
            dsgdfgdsfg
        </li>
        <li>
            <FONT COLOR="#4B4B4B" FACE="Lato" SIZE="12">errytrtyr</FONT>
        </li>
    </ul>
    <div style="text-align:LEFT; ">
        sdgfdsgsds
    </div>
    <ul>
        <li>
            <FONT COLOR="#4B4B4B" FACE="System" SIZE="16">nm,hjku
                <FONT FACE="Lato" SIZE="12"></FONT>
            </FONT>
        </li>
        <li>
            <FONT COLOR="#4B4B4B" FACE="System" SIZE="16">
                <B>hgjgj</B>
                <FONT FACE="Lato" SIZE="12"></FONT>
            </FONT>
        </li>
    </ul>
    <div style="text-align:CENTER; ">
        <FONT COLOR="#4B4B4B" FACE="Lato" SIZE="12">centered text</FONT>
    </div>
</div>

My code:

<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    version="1.0">

    <!--
        Rewrites the root/TEXTFORMAT/P/LI input as HTML-style output:
        root becomes a wrapping div, each P becomes a div carrying a
        text-align style, each LI becomes an li, and the TEXTFORMAT
        wrappers are dropped. Everything else is copied through.
        Note: no template here emits a ul, so the li elements end up
        directly inside the outer div.
    -->
    <xsl:strip-space elements="*"/>
    <xsl:output indent="yes" method="html"/>

    <!-- identity template: copies any node or attribute that no more
         specific template below matches -->

    <xsl:template match="node()|@*">
        <xsl:copy>
            <xsl:apply-templates select="node()|@*"/>
        </xsl:copy>
    </xsl:template>

    <!-- Replace the root element with a single wrapping div -->
    <xsl:template match="root">
        <div>
            <xsl:apply-templates/>
        </div>
    </xsl:template>

    <!-- remove unwanted attributes (the empty template outputs nothing) -->
    <xsl:template match="@LETTERSPACING|@KERNING"/>

    <!-- Replace <P> with a <div> and turn its ALIGN attribute into a
         CSS text-align declaration. The style attribute is always
         created; it is empty when P has no ALIGN attribute. -->
    <xsl:template match="P">
        <div>
            <xsl:attribute name="style">
                <!-- collect attributes -->
                <xsl:variable name="styles">
                    <xsl:if test="@ALIGN">
                        <xsl:value-of select="concat('text-align:', @ALIGN )"/>
                        <xsl:text>; </xsl:text>
                    </xsl:if>
                </xsl:variable>
                <!-- emit the collected styles as-is; the trailing space
                     after the semicolon is NOT removed here -->
                <xsl:value-of select="$styles"/>
            </xsl:attribute>
            <!-- children only: the attributes of P are not copied -->
            <xsl:apply-templates/>
        </div>
    </xsl:template>

    <!-- Replace <LI> with <li> (content is processed, attributes of LI
         are not copied) -->
    <xsl:template match="LI">
        <li><xsl:apply-templates/></li>
    </xsl:template>

    <!-- Remove TEXTFORMAT: output its children without the wrapper -->
    <xsl:template match="TEXTFORMAT">
        <xsl:apply-templates/>
    </xsl:template>

</xsl:stylesheet>

Solution

  • The following XSLT 1.0 transformation gives you what seems to be the wanted result:

    <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
        <!--
            Rewrites the root/TEXTFORMAT/P/LI input as div/ul/li output.
            Each run of consecutive TEXTFORMAT siblings that have an LI
            child is emitted as one <ul> holding the corresponding <li>s.
        -->
        <xsl:strip-space elements="*"/>
        <xsl:output indent="yes" method="html"/>
    
        <!--
            Groups every TEXTFORMAT[LI] under the generate-id() of the
            element that starts its run. Reading the "use" expression
            step by step:
              (self::*|preceding-sibling::*)  the element itself plus all
                                              preceding element siblings
              [LI]                            keep those with an LI child
              [not(preceding-sibling::*[1][LI])]
                                              keep those whose immediately
                                              preceding element sibling has
                                              no LI child (run starts)
              [last()]                        the last one in document
                                              order, i.e. the nearest start
        -->
        <xsl:key name="list" match="TEXTFORMAT[LI]" use="generate-id(
            (self::*|preceding-sibling::*)[LI][
                not(preceding-sibling::*[1][LI])
            ][last()]
        )" />
    
        <!-- identity template: copies any node or attribute that no more
             specific template below matches -->
        <xsl:template match="node()|@*">
            <xsl:copy>
                <xsl:apply-templates select="node()|@*"/>
            </xsl:copy>
        </xsl:template>
    
        <!-- Replace the root element with a single wrapping div -->
        <xsl:template match="root">
            <div>
                <xsl:apply-templates />
            </div>
        </xsl:template>
    
        <!-- Remove <P> tag and set the alignment. The style attribute is
             always created; its value is whatever the mode="css"
             templates produce for the attributes of P. -->
        <xsl:template match="P">
            <div>
                <xsl:attribute name="style">
                    <xsl:apply-templates select="@*" mode="css" />
                </xsl:attribute>
                <!-- children only: the attributes of P are not copied -->
                <xsl:apply-templates/>
            </div>
        </xsl:template>
    
        <!-- ALIGN="X" becomes the CSS declaration text-align:X; -->
        <xsl:template match="@ALIGN" mode="css">
            <xsl:value-of select="concat('text-align:', ., ';')"/>
        </xsl:template>
        <!-- add more mode="css" templates here for other attributes;
             any attribute without one is swallowed by the rule below -->
        <xsl:template match="@*" mode="css" />
    
        <!-- remove unwanted attributes (the empty template outputs nothing) -->
        <xsl:template match="@LETTERSPACING|@KERNING"/>
    
        <!--
            TEXTFORMAT with an LI child. The predicate gives this rule a
            higher default priority than the plain TEXTFORMAT rule further
            down, so it is the one applied to these elements.
            key('list', generate-id()) is non-empty only when the current
            element is the start of a run; it then returns every member of
            that run, which is wrapped in one <ul>. For later members of
            the run the lookup is empty and nothing is output, because
            they were already emitted by the run's first element.
        -->
        <xsl:template match="TEXTFORMAT[LI]">
            <xsl:variable name="adjacent" select="key('list', generate-id())" />
            <xsl:if test="$adjacent">
                <ul>
                    <xsl:apply-templates select="$adjacent/LI" />
                </ul>
            </xsl:if>
        </xsl:template>
    
        <!-- Replace <LI> with <li> (content is processed, attributes of LI
             are not copied) -->
        <xsl:template match="LI">
            <li><xsl:apply-templates/></li>
        </xsl:template>
    
        <!-- Remove TEXTFORMAT: output its children without the wrapper -->
        <xsl:template match="TEXTFORMAT">
            <xsl:apply-templates/>
        </xsl:template>
    
    </xsl:stylesheet>
    

    Result:

    <div>
       <div style="text-align:LEFT;">
          edfg
    
       </div>
       <div style="text-align:LEFT;">
          sdgfdsgsds
    
       </div>
       <ul>
          <li>
             sdfgdsg
    
          </li>
          <li>
             dsgdfgdsfg
    
          </li>
          <li><FONT FACE="Lato" SIZE="12" COLOR="#4B4B4B">errytrtyr</FONT></li>
       </ul>
       <div style="text-align:LEFT;">
          sdgfdsgsds
    
       </div>
       <ul>
          <li><FONT FACE="System" SIZE="16" COLOR="#4B4B4B">nm,hjku
                <FONT FACE="Lato" SIZE="12"></FONT></FONT></li>
          <li><FONT FACE="System" SIZE="16" COLOR="#4B4B4B"><B>hgjgj</B><FONT FACE="Lato" SIZE="12"></FONT></FONT></li>
       </ul>
       <div style="text-align:CENTER;"><FONT FACE="Lato" SIZE="12" COLOR="#4B4B4B">centered text</FONT></div>
    </div>
    

    The key to the solution, quite literally, is this construction:

    <!-- Key value: id of the TEXTFORMAT[LI] that starts the run the
         matched element belongs to -->
    <xsl:key name="list" match="TEXTFORMAT[LI]" use="generate-id(
        (self::*|preceding-sibling::*)[LI][
            not(preceding-sibling::*[1][LI])
        ][last()]
    )" />
    

    This indexes each TEXTFORMAT[LI] in the document by the unique ID of the nearest TEXTFORMAT[LI] that started the current series, i.e. the nearest one (possibly the element itself) that has no TEXTFORMAT[LI] immediately before it.

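    Going from there, we can decide in <xsl:template match="TEXTFORMAT[LI]"> whether to output something or not for any given TEXTFORMAT[LI]: for the first one of a series, key('list', generate-id()) returns the whole series, which is wrapped in a single <ul>; for every other one it returns an empty node-set, so nothing is output.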