Search code examples
xmlxslt

How do I merge 2 XML files using XSLT, without duplicates?


I have got a users-list.xml:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE users SYSTEM "user-list.dtd">
<!-- Sample user list: each <user> holds forename, surname and email as child elements.
     The "..." line below is a placeholder for further <user> entries, not real content. -->
<users>
    <user>
        <forename>Joe</forename>
        <surname>Bloggs</surname>
        <email>[email protected]</email>
    </user>
    <user>
        <forename>Winston</forename>
        <surname>Smith</surname>
        <email>[email protected]</email>
    </user>
    ...
</users>

And a mailing-list.xml:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mailingList SYSTEM "mailing-list.dtd">
<!-- Sample mailing list: each <subscriber> holds familyname, givenname and email as
     attributes. The two entries list their attributes in different orders; attribute
     order is not significant in XML. The "..." line below is a placeholder for further
     <subscriber> entries, not real content. -->
<mailingList>
    <subscriber familyname="Smith"
                givenname="John"
                email="[email protected]"/>
    <subscriber givenname="Luke"
                familyname="Skywalker"
                email="[email protected]"/>
    ...
</mailingList>

I would like to know how to merge the mailing list into the user list, if the email doesn't already exist in the user list.

I am very new to this kind of thing and would like to learn more of it.

I have created a merge.xslt and this is what I have so far, how far off am I?

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE list [
    <!ELEMENT list (entry)+>
    <!ELEMENT entry (firstname, lastname, email)>
    <!ELEMENT firstname (#PCDATA)>
    <!ELEMENT lastname (#PCDATA)>
    <!ELEMENT email (#PCDATA)>
]>
<!-- merge.xslt: intended to be run against user-list.xml and to emit a <users> element
     containing the existing users plus any mailing-list subscriber whose email is not
     already present.
     NOTE(review): the internal DTD subset above declares a root named "list" with
     entry/firstname/lastname children, which matches neither this stylesheet's root
     (xsl:stylesheet) nor the element names used below; it looks like a leftover. -->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
    <!-- Define a key named 'emails' that indexes both user and subscriber elements by their
         respective email addresses: the child <email> element for user, the @email attribute
         for subscriber. -->
    <xsl:key name="emails" match="user|subscriber" use="email|@email"/>
    
    <!-- Template to process the users list in user-list.xml: matches the <users> root element
         and rebuilds it. -->
    <xsl:template match="/users">
        <users>
            <!-- Copy all users from the entire users list in user-list.xml -->
            <xsl:apply-templates select="user"/>
            <!-- Merge subscribers from the second XML, but only if their email doesn't already
                 exist in the first.
                 NOTE(review): document('') resolves to this stylesheet document itself, not to
                 mailing-list.xml, so /mailingList/subscriber selects nothing here.
                 NOTE(review): the three-argument form of key() is XSLT 2.0; this stylesheet
                 declares version="1.0", where key() takes exactly two arguments. -->
            <xsl:apply-templates select="document('')/mailingList/subscriber[not(key('emails', @email, /users))]"/>
        </users>
    </xsl:template>

    <!-- Template to copy users from the first XML.
         NOTE(review): no template in this stylesheet matches forename, surname or email, so
         the built-in template rules handle them and emit only their text content; the child
         element tags are not reproduced inside the new <user>. -->
    <xsl:template match="user">
        <user>
            <xsl:apply-templates select="@*|node()"/>
        </user>
    </xsl:template>

    <!-- Template to merge subscribers from the second XML into the format matching
         user-list.xml: each attribute of <subscriber> becomes a child element of <user>. -->
    <xsl:template match="subscriber">
        <user>
            <forename><xsl:value-of select="@givenname"/></forename>
            <surname><xsl:value-of select="@familyname"/></surname>
            <email><xsl:value-of select="@email"/></email>
        </user>
    </xsl:template>
</xsl:stylesheet>

Thanks!


Solution

  • Assuming XSLT 1.0, I think you could do simply:

    <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="xml" indent="yes"/>
    <xsl:strip-space elements="*"/>

    <!-- URI of the mailing-list document to merge in; can be overridden by the caller. -->
    <xsl:param name="subscribers">mailing-list.xml</xsl:param>

    <!-- Rebuild the <users> root: keep every existing user as-is, then append one new
         <user> per subscriber whose email is not already in the user list. -->
    <xsl:template match="/users">
        <!-- Emails already present in the user list (children of the matched <users>). -->
        <xsl:variable name="known-emails" select="user/email"/>
        <xsl:copy>
            <xsl:copy-of select="user"/>
            <!-- A string = node-set comparison is true if ANY node matches, so not(...)
                 keeps only subscribers whose email equals none of the known emails. -->
            <xsl:apply-templates
                select="document($subscribers)/mailingList/subscriber[not(@email = $known-emails)]"
                mode="as-user"/>
        </xsl:copy>
    </xsl:template>

    <!-- Convert one <subscriber> (attribute-based) into a <user> (element-based). -->
    <xsl:template match="subscriber" mode="as-user">
        <user>
            <forename><xsl:value-of select="@givenname"/></forename>
            <surname><xsl:value-of select="@familyname"/></surname>
            <email><xsl:value-of select="@email"/></email>
        </user>
    </xsl:template>

    </xsl:stylesheet>
    

    I don't know if there is much to be gained by using a key here, given that in XSLT 1.0 keys work only in the context of the current document, and that you actually need to get the subscribers that would NOT be fetched by a key based on the existing users emails (i.e. calculate a set difference).

    But if you really wanted, you could do it like this:

    <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="xml" indent="yes"/>
    <xsl:strip-space elements="*"/>
    
    <!-- URI of the mailing-list document to merge in; can be overridden by the caller.
         The default must include the .xml extension so that document() resolves the
         file named mailing-list.xml. -->
    <xsl:param name="subscribers">mailing-list.xml</xsl:param>
    <!-- Index subscribers by email address. In XSLT 1.0, key() searches only the document
         that contains the context node. -->
    <xsl:key name="subscr" match="subscriber" use="@email" />
    
    <!-- Rebuild the <users> root: keep every existing user as-is, then append one new
         <user> per subscriber whose email is not already in the user list. -->
    <xsl:template match="/users">
        <xsl:copy>
            <xsl:copy-of select="user"/>
            <!-- Capture the existing emails now, before the context moves to the other document. -->
            <xsl:variable name="user-mails" select="user/email" />
            <!-- switch context to the other document so that key() looks up subscribers there -->
            <xsl:for-each select="document($subscribers)">
                <!-- find duplicates: subscribers whose email equals the value of any user email -->
                <xsl:variable name="duplicates" select="key('subscr', $user-mails)" />
                <!-- get non-duplicates (set difference): adding the current node to $duplicates
                     changes the count only when the node is not already a member -->
                <xsl:for-each select="mailingList/subscriber[count(.|$duplicates) != count($duplicates)]">
                    <user>
                        <forename>
                            <xsl:value-of select="@givenname"/>
                        </forename>
                        <surname>
                            <xsl:value-of select="@familyname"/>
                        </surname>
                        <email>
                            <xsl:value-of select="@email"/>
                        </email>
                    </user>
                </xsl:for-each>
            </xsl:for-each>
        </xsl:copy>
    </xsl:template>
    
    </xsl:stylesheet>