Search code examples
performance xslt-grouping xslt-3.0

I'm using maps for streaming and grouping and need advice on how to improve the performance


My source data looks like this; it is a really large XML file of 2+ GB.

    <?xml version="1.0" encoding="UTF-8"?>
    <!-- Sample input: a flat list of jrnl1 rows. The CCD and CC attributes
         of each row together form the grouping key. -->
    <Journal_Lines>
        <jrnl1 CY="USD" CCD="1001" CC="11062" IsPyJrl="1" AID="11382" LAI="107709"
            TLCCr="11062" TCAmt="222.85" TDAmt="0" CDI="C" CDAmt="222.85" DN=""
            EDt="2019-06-16-07:00" SCd="" HURCl="0"/>
        <jrnl1 CY="USD" CCD="1001" CC="11062" IsPyJrl="1" AID="11382" LAI="240997"
            TLCCr="11062" TCAmt="0" TDAmt="222.85" CDI="D" CDAmt="222.85" DN=""
            EDt="2019-06-16-07:00" SCd="" HURCl="0"/>
        <jrnl1 CY="USD" CCD="1001" CC="16835" IsPyJrl="1" AID="12661" LAI="107769"
            TLCCr="16835" TCAmt="94.06" TDAmt="0" CDI="C" CDAmt="94.06" DN="" EDt="2019-06-16-07:00"
            SCd="" HURCl="0"/>
        <jrnl1 CY="USD" CCD="1001" CC="16835" IsPyJrl="1" AID="12661" LAI="240997"
            TLCCr="16835" TCAmt="0" TDAmt="94.06" CDI="D" CDAmt="94.06" DN="" EDt="2019-06-16-07:00"
            SCd="" HURCl="0"/>
        <jrnl1 CY="USD" CCD="1001" CC="19655" IsPyJrl="1" AID="12731" LAI="240997"
            TLCCr="19655" TCAmt="0" TDAmt="899.11" CDI="D" CDAmt="899.11" DN=""
            EDt="2019-06-16-07:00" SCd="" HURCl="0"/>
        <jrnl1 CY="USD" CCD="1001" CC="19655" IsPyJrl="1" AID="12731" LAI="107709"
            TLCCr="19655" TCAmt="899.11" TDAmt="0" CDI="C" CDAmt="899.11" DN=""
            EDt="2019-06-16-07:00" SCd="" HURCl="0"/>
    </Journal_Lines>

My Output is

<!-- Sample output: one Group element per distinct (CCD, CC) pair, holding the
     matching input rows as Jrnln elements with their attributes unchanged. -->
<Journal_Lines xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns:map="http://www.w3.org/2005/xpath-functions/map">
    <Group CCD="1001" CC="11062">
        <Jrnln CY="USD" CCD="1001" CC="11062" IsPyJrl="1" AID="11382" LAI="107709" TLCCr="11062"
            TCAmt="222.85" TDAmt="0" CDI="C" CDAmt="222.85" DN="" EDt="2019-06-16-07:00" SCd=""
            HURCl="0"/>
        <Jrnln CY="USD" CCD="1001" CC="11062" IsPyJrl="1" AID="11382" LAI="240997" TLCCr="11062"
            TCAmt="0" TDAmt="222.85" CDI="D" CDAmt="222.85" DN="" EDt="2019-06-16-07:00" SCd=""
            HURCl="0"/>
    </Group>
    <Group CCD="1001" CC="16835">
        <Jrnln CY="USD" CCD="1001" CC="16835" IsPyJrl="1" AID="12661" LAI="107769" TLCCr="16835"
            TCAmt="94.06" TDAmt="0" CDI="C" CDAmt="94.06" DN="" EDt="2019-06-16-07:00" SCd=""
            HURCl="0"/>
        <Jrnln CY="USD" CCD="1001" CC="16835" IsPyJrl="1" AID="12661" LAI="240997" TLCCr="16835"
            TCAmt="0" TDAmt="94.06" CDI="D" CDAmt="94.06" DN="" EDt="2019-06-16-07:00" SCd=""
            HURCl="0"/>
    </Group>
    <Group CCD="1001" CC="19655">
        <Jrnln CY="USD" CCD="1001" CC="19655" IsPyJrl="1" AID="12731" LAI="240997" TLCCr="19655"
            TCAmt="0" TDAmt="899.11" CDI="D" CDAmt="899.11" DN="" EDt="2019-06-16-07:00" SCd=""
            HURCl="0"/>
        <Jrnln CY="USD" CCD="1001" CC="19655" IsPyJrl="1" AID="12731" LAI="107709" TLCCr="19655"
            TCAmt="899.11" TDAmt="0" CDI="C" CDAmt="899.11" DN="" EDt="2019-06-16-07:00" SCd=""
            HURCl="0"/>
    </Group>
</Journal_Lines>

I'm grouping and sorting by CC and CCD. My current code is below and it works, but it takes a very long time.

    <?xml version="1.0" encoding="UTF-8"?>
    <!--
        Streams Journal_Lines/jrnl1 and accumulates every row in a map keyed by
        "CCD^CC". Each row is flattened to a '|'-separated string of its attribute
        values; rows sharing a key are appended to one string with '^' between
        them. The grouped output is written only in xsl:on-completion.
    -->
    <xsl:stylesheet version="3.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
        xmlns:xs="http://www.w3.org/2001/XMLSchema"
        xmlns:map="http://www.w3.org/2005/xpath-functions/map">

        <xsl:output indent="no"/>
        <xsl:mode streamable="yes" on-no-match="shallow-skip"/>
        <!-- Position to attribute-name lookup, used to relabel the '|'-separated
             values on output. Assumes @* yields the attributes of every row in
             exactly this order: TODO confirm for the real data. -->
        <xsl:variable name="vElementMap" as="map(*)" 
            select="map { 
            1:'CY', 2:'CCD', 3:'CC', 4:'IsPyJrl', 5:'AID',
            6:'LAI', 7:'TLCCr', 8:'TCAmt', 9:'TDAmt', 10:'CDI',
            11:'CDAmt', 12:'DN', 13:'EDt', 14:'SCd', 15:'HURCl' }"
        />

        <xsl:template match="/">
            <xsl:iterate select="Journal_Lines/jrnl1">
                <!-- Accumulator: group key mapped to the '^'-joined row strings of that group. -->
                <xsl:param name="mapJournalLines" as="map(xs:string, xs:string)" select="map{}"/>

                <!-- Runs once after the last jrnl1: emit the groups in key order. -->
                <xsl:on-completion>
                    <Journal_Lines>
                        <!-- Sort data  -->
                        <xsl:for-each select="map:for-each($mapJournalLines, function ($k, $v) {$k})">
                            <xsl:sort select="."/>
                            <!-- The key is "CCD^CC"; split it back into its two parts. -->
                            <Group CCD="{substring-before(.,'^')}" CC="{substring-after(.,'^')}">
                                <!-- Split the group string into rows, then each row into attribute values. -->
                                <xsl:for-each select="tokenize($mapJournalLines(.),'\^')">
                                    <Jrnln>
                                        <xsl:for-each select="tokenize(.,'\|')">
                                            <xsl:attribute name="{$vElementMap(position())}">
                                                <xsl:value-of select="."/>
                                            </xsl:attribute>
                                        </xsl:for-each>
                                    </Jrnln>
                                </xsl:for-each>
                            </Group>                        
                        </xsl:for-each>
                    </Journal_Lines>
                </xsl:on-completion>

                <!-- Snapshot of the current streamed row so it can be read more than once. -->
                <xsl:variable name="current-entry" select="copy-of()"/>
                <xsl:variable name="vKey" select="$current-entry/@CCD || '^' || $current-entry/@CC"/>
                <!-- All attribute values of the row joined with '|'. -->
                <xsl:variable name="vValue">
                    <xsl:for-each select="$current-entry/@*">
                        <xsl:if test="position() ne 1">|</xsl:if>
                        <xsl:value-of select="."/>
                    </xsl:for-each>
                </xsl:variable>

                <!-- NOTE(review): for an existing key the whole accumulated group string is
                     concatenated again with the new row, and everything is re-tokenized at the
                     end. This is the likely cost driver on large input; confirm by profiling. -->
                <xsl:next-iteration>

                    <xsl:with-param name="mapJournalLines"
                        select="
                        if (map:contains($mapJournalLines, xs:string($vKey))) then
                        map:put($mapJournalLines, xs:string($vKey), $mapJournalLines(xs:string($vKey)) || '^' || xs:string($vValue)) 
                        else 
                        map:put($mapJournalLines, xs:string($vKey), xs:string($vValue))"
                    />

                </xsl:next-iteration>
            </xsl:iterate>

        </xsl:template>

    </xsl:stylesheet>

I'm converting a single jrnl1 node to a single pipe-delimited line, and multiple grouped lines are delimited by ^. This works with a small load but takes forever with large data.

Any help is appreciated.


Solution

  • Your data, being attribute-centered, seems to be a good fit for a streamable xsl:fork/xsl:for-each-group:

    <!--
        Streamable grouping of Journal_Lines/jrnl1 by the composite key (CCD, CC).
        Groups are written in order of first appearance; no sorting is applied.
    -->
    <xsl:stylesheet version="3.0"
        exclude-result-prefixes="#all"
        xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
        xmlns:xs="http://www.w3.org/2001/XMLSchema">

        <xsl:mode on-no-match="shallow-skip" streamable="yes"/>
        <xsl:output indent="yes"/>

        <!-- One input row becomes one Jrnln element carrying the same attributes. -->
        <xsl:template match="jrnl1">
            <Jrnln>
                <xsl:copy-of select="@*"/>
            </Jrnln>
        </xsl:template>

        <!-- Copy the root element and fill it with one Group per (CCD, CC) pair.
             The group-by is wrapped in xsl:fork so it can be used in this streamable mode. -->
        <xsl:template match="Journal_Lines">
            <xsl:copy>
                <xsl:fork>
                    <xsl:for-each-group select="jrnl1" group-by="@CCD, @CC" composite="yes">
                        <!-- Composite key: item 1 is CCD, item 2 is CC. -->
                        <xsl:variable name="groupKey" select="current-grouping-key()"/>
                        <Group CCD="{$groupKey[1]}" CC="{$groupKey[2]}">
                            <xsl:apply-templates select="current-group()"/>
                        </Group>
                    </xsl:for-each-group>
                </xsl:fork>
            </xsl:copy>
        </xsl:template>

    </xsl:stylesheet>
    

    Even this approach, however, requires the XSLT processor to buffer the groups in memory: until the last element has been processed it cannot be decided which group that element belongs to, so no group can be written out and closed before the end of the input. Only group-adjacent would reduce the need for buffering (and for xsl:fork), but that obviously requires the elements of each group to already follow one another in the input.

    Sorting is not really compatible with streamable processing in XSLT 3; I think any use of it breaks the streamability analysis. You would need to throw in copy-of(), and I am not sure the result then has any advantage over traditional (non-streaming) XSLT:

    <!-- Journal_Lines template variant with sorted groups: every jrnl1 is
         materialised with copy-of() before grouping, and the groups are
         ordered by CCD first and CC second. -->
    <xsl:template match="Journal_Lines">
        <xsl:copy>
            <xsl:for-each-group select="jrnl1 ! copy-of()" group-by="@CCD, @CC" composite="yes">
                <xsl:sort select="current-grouping-key()[1]"/>
                <xsl:sort select="current-grouping-key()[2]"/>
                <!-- Composite key: item 1 is CCD, item 2 is CC. -->
                <xsl:variable name="groupKey" select="current-grouping-key()"/>
                <Group CCD="{$groupKey[1]}" CC="{$groupKey[2]}">
                    <xsl:apply-templates select="current-group()"/>
                </Group>
            </xsl:for-each-group>
        </xsl:copy>
    </xsl:template>
    

    It might be worth looking into dedicated XML database systems like eXist or BaseX to see whether their (mainly XQuery-based) processing allows more performant and less memory-intensive sorting and grouping than a standalone XSLT 3 processor.

    For your current approach with maps of strings that you concatenate and tokenize, it might be worth checking whether nested arrays, or a nesting of arrays and sequences, perform better. Perhaps simply storing the elements you have already copied with copy-of() is also faster than concatenating and splitting strings:

    <!--
        Streams Journal_Lines/jrnl1 and groups the rows in a map keyed by "CCD^CC".
        The map values are the copied jrnl1 elements themselves, so no row is
        serialised to a string and tokenized again. The grouped output is written
        in key order once the input is exhausted.
    -->
    <xsl:stylesheet version="3.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
        xmlns:xs="http://www.w3.org/2001/XMLSchema"
        xmlns:map="http://www.w3.org/2005/xpath-functions/map">

        <xsl:output indent="yes"/>
        <xsl:mode streamable="yes" on-no-match="shallow-skip"/>

        <xsl:template match="/">
            <xsl:iterate select="Journal_Lines/jrnl1">
                <!-- Accumulator: group key mapped to the sequence of rows seen for it so far. -->
                <xsl:param name="mapJournalLines" as="map(xs:string, element(jrnl1)*)" select="map{}"/>

                <!-- Runs once after the last jrnl1: emit the groups sorted by key. -->
                <xsl:on-completion>
                    <Journal_Lines>
                        <xsl:for-each select="map:keys($mapJournalLines)">
                            <xsl:sort select="."/>
                            <!-- The key is "CCD^CC"; split it back into its two parts. -->
                            <Group CCD="{substring-before(.,'^')}" CC="{substring-after(.,'^')}">
                                <xsl:for-each select="$mapJournalLines(.)">
                                    <Jrnln>
                                        <xsl:copy-of select="@*"/>
                                    </Jrnln>
                                </xsl:for-each>
                            </Group>
                        </xsl:for-each>
                    </Journal_Lines>
                </xsl:on-completion>

                <!-- Snapshot of the current streamed row so it can be stored in the map. -->
                <xsl:variable name="current-entry" select="copy-of()"/>
                <xsl:variable name="vKey" as="xs:string" select="$current-entry/@CCD || '^' || $current-entry/@CC"/>

                <!-- Looking up a key that is not in the map yet yields the empty sequence,
                     so one map:put covers both the first and every later row of a group. -->
                <xsl:next-iteration>
                    <xsl:with-param name="mapJournalLines"
                        select="map:put($mapJournalLines, $vKey, ($mapJournalLines($vKey), $current-entry))"/>
                </xsl:next-iteration>
            </xsl:iterate>

        </xsl:template>

    </xsl:stylesheet>
    

    Finally, to keep your original approach of a map of string data while avoiding all the concatenation and tokenizing, you could try a map(xs:string, array(xs:string)*), that is, a map storing each group of data as a sequence of string arrays, where each array represents a line in your final output:

    <!--
        Streams Journal_Lines/jrnl1 and groups the rows in a map keyed by "CCD^CC".
        Each row is kept as one array of its attribute values (as strings); a group
        is the sequence of those arrays. Output is written in key order at the end.
    -->
    <xsl:stylesheet version="3.0"
        xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
        xmlns:map="http://www.w3.org/2005/xpath-functions/map"
        xmlns:xs="http://www.w3.org/2001/XMLSchema">

        <xsl:mode on-no-match="shallow-skip" streamable="yes"/>
        <xsl:output indent="yes"/>

        <!-- Position to attribute-name lookup used when the arrays are turned back
             into attributes. Assumes @* yields the attributes of every row in
             exactly this order: TODO confirm for the real data. -->
        <xsl:variable name="vElementMap" as="map(*)"
            select="map {
            1:'CY', 2:'CCD', 3:'CC', 4:'IsPyJrl', 5:'AID',
            6:'LAI', 7:'TLCCr', 8:'TCAmt', 9:'TDAmt', 10:'CDI',
            11:'CDAmt', 12:'DN', 13:'EDt', 14:'SCd', 15:'HURCl' }"/>

        <xsl:template match="/">
            <xsl:iterate select="Journal_Lines/jrnl1">
                <!-- Accumulator: group key mapped to the row arrays collected for it so far. -->
                <xsl:param name="mapJournalLines" as="map(xs:string, array(xs:string)*)" select="map{}"/>

                <!-- Runs once after the last jrnl1: emit the groups sorted by key. -->
                <xsl:on-completion>
                    <Journal_Lines>
                        <xsl:for-each select="map:keys($mapJournalLines)">
                            <xsl:sort select="."/>
                            <!-- The key is "CCD^CC"; split it back into its two parts. -->
                            <Group CCD="{substring-before(., '^')}" CC="{substring-after(., '^')}">
                                <xsl:for-each select="$mapJournalLines(.)">
                                    <Jrnln>
                                        <!-- ?* unpacks the array; the member position selects the name. -->
                                        <xsl:for-each select="?*">
                                            <xsl:attribute name="{$vElementMap(position())}" select="."/>
                                        </xsl:for-each>
                                    </Jrnln>
                                </xsl:for-each>
                            </Group>
                        </xsl:for-each>
                    </Journal_Lines>
                </xsl:on-completion>

                <xsl:variable name="vKey" as="xs:string" select="@CCD || '^' || @CC"/>
                <!-- The attribute values of the current row, in @* order, as one array. -->
                <xsl:variable name="vRow" as="array(xs:string)" select="array { @* ! string(.) }"/>

                <!-- Looking up a key that is not in the map yet yields the empty sequence,
                     so one map:put covers both the first and every later row of a group. -->
                <xsl:next-iteration>
                    <xsl:with-param name="mapJournalLines"
                        select="map:put($mapJournalLines, $vKey, ($mapJournalLines($vKey), $vRow))"/>
                </xsl:next-iteration>
            </xsl:iterate>

        </xsl:template>

    </xsl:stylesheet>