Search code examples
xml doctype xslt-3.0 xml-entities

Copying an XML file including doctype, entities and notations with XSLT 3


I have a series of XML documents that are copied from one folder into another, transformed with msxsl.exe 1.1.0.1 and an XSLT 1.0 stylesheet, then copied back to the original folder. I don't know why the doctype, entities and notations are not copied over by the transformation, but currently they are re-inserted with JavaScript in the stylesheet. I have to replace the JavaScript with XSLT 3.0 so that it will work with Saxon-HE 11.

The doctype declaration comes first in the XML, before the root element. This is also my desired output:

  <!DOCTYPE dmodule [
  <!ENTITY ICN-XXX12-001-01 SYSTEM "ICN-XXX12-001-01.SWF" NDATA swf >
  <!ENTITY ICN-XXX49-001-01 SYSTEM "ICN-XXX49-001-01 SYSTEM.CGM" NDATA cgm >
  <!ENTITY ICN-AAA235-000000-0-A-001-01 SYSTEM "ICN-AAA235-000000-0-A-001-01.wrlzip" NDATA WRLZIP>
  <!NOTATION cgm PUBLIC "-//USA-DOD//NOTATION Computer Graphics Metafile//EN" >
  <!NOTATION swf PUBLIC "-//S1000D//NOTATION X-SHOCKWAVE-FLASH 3D Models Encoding//EN" >
  <!NOTATION WRLZIP SYSTEM "WRLZIP">
]>
<dmodule xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xmlns:dc="http://www.purl.org/dc/elements/1.1/"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:xlink="http://www.w3.org/1999/xlink"
         xsi:noNamespaceSchemaLocation="../schema/proced.xsd">
         <content>
            <figure>
              <title/>
                <graphic infoEntityIdent="ICN-XXX49-001-01"/>
            </figure>
            <proceduralStep>
                <para>Check the brake system function.</para>
                <multimedia>
                   <title>Brake function</title>
                   <multimediaObject autoPlay="1" fullScreen="0" infoEntityIdent="ICN-XXX12-001-01" multimediaType="other"/>
                </multimedia>
             </proceduralStep>
             <multimedia>
                 <multimediaObject infoEntityIdent="ICN-AAA235-000000-0-A-001-01"
                        multimediaType="3D"
                        xlink:href="ICN-AAA235-000000-0-A-001-01.wrlzip"
                        xlink:type="simple"/>
                </multimedia>
         </content>
</dmodule>

The entities are referenced by the @infoEntityIdent attribute of various elements, but there is not always an indication of the file type:

    <graphic infoEntityIdent="ICN-XXX49-001-01"/>
    <multimediaObject autoPlay="1" fullScreen="0" infoEntityIdent="ICN-XXX12-001-01"
                                           multimediaType="other"/>
    <multimediaObject infoEntityIdent="ICN-AAA235-000000-0-A-001-01"
                        multimediaType="3D" xlink:href="ICN-AAA235-000000-0-A-001-01.wrlzip"
 xlink:type="simple"/>

I can get the doctype inserted correctly, but I don't know how to access the entities and notations:

<!--
  Root template: writes a DOCTYPE declaration named after the document element,
  with an (as yet empty) internal subset, then copies the document.
  The markup characters are written with disable-output-escaping, so the
  declaration only appears when the result is serialized.
-->
<xsl:template match="/">
    <xsl:text disable-output-escaping="yes">&#xA;&lt;!DOCTYPE </xsl:text>
    <xsl:value-of select="local-name(*)"/>
    <xsl:text> [</xsl:text>
    <!-- entities and notations here -->
    <xsl:text disable-output-escaping="yes">]&gt;&#xA;</xsl:text>
    <!-- a document node has no attributes, so only its child nodes are processed -->
    <xsl:copy>
      <xsl:apply-templates select="node()"/>
    </xsl:copy>
</xsl:template>

Current output:

<!DOCTYPE dmodule []>
<dmodule xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
     xmlns:dc="http://www.purl.org/dc/elements/1.1/"
     xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
     xmlns:xlink="http://www.w3.org/1999/xlink"
     xsi:noNamespaceSchemaLocation="../schema/proced.xsd">
     <content>
        <figure>
          <title/>
            <graphic infoEntityIdent="ICN-XXX49-001-01"/>
        </figure>
        <proceduralStep>
            <para>Check the brake system function.</para>
            <multimedia>
               <title>Brake function</title>
               <multimediaObject autoPlay="1" fullScreen="0" infoEntityIdent="ICN-XXX12-001-01" multimediaType="other"/>
            </multimedia>
         </proceduralStep>
         <multimedia>
           <multimediaObject infoEntityIdent="ICN-AAA235-000000-0-A-001-01"
                    multimediaType="3D"
                    xlink:href="ICN-AAA235-000000-0-A-001-01.wrlzip"
                    xlink:type="simple"/>
            </multimedia>
     </content>

This is the inherited JavaScript in the stylesheet; it does give the desired result:

<msxsl:script language="JavaScript" implements-prefix="js">

    <![CDATA[
/**
 * Builds the text of the DOCTYPE declaration for the source document.
 *
 * The declaration is named after the document element and its internal subset
 * lists every entity and then every notation of the document's doctype, each
 * on its own line, using the node's own `.xml` serialization. When the
 * document has no doctype the internal subset is empty.
 *
 * An earlier version also created a Scripting.FileSystemObject and derived a
 * base path from the document URL; neither value was ever used, so both were
 * removed. The returned string is unchanged.
 *
 * @param root  node list whose first item (root.item(0)) is the document node
 * @return      the string "\n<!DOCTYPE name [\n ... ]>\n"
 */
function doctype(root) {
    var doc = root.item(0);
    var dtd = doc.doctype;
    var lines = ['\n<!DOCTYPE ' + doc.documentElement.nodeName + ' ['];

    if (dtd) {
        // Entities first, then notations: the same order as before.
        var maps = [dtd.entities, dtd.notations];
        for (var m = 0; m < maps.length; m++) {
            var declarations = maps[m];
            for (var i = 0; i < declarations.length; i++) {
                lines.push(declarations.item(i).xml);
            }
        }
    }

    lines.push(']>\n');
    // Joining with "\n" puts one declaration per line and terminates the
    // opening "[" line even when there are no declarations at all.
    return lines.join('\n');
}
    ]]>
  </msxsl:script>

And this is the template that uses the JavaScript:

<!--
  Root template (msxsl version): writes the DOCTYPE text returned by the
  js:doctype() extension function unescaped, then copies the document.
-->
<xsl:template match="/">
    <xsl:value-of disable-output-escaping="yes" select="js:doctype(.)"/>
    <!-- the root node carries no attributes, so only its child nodes are processed -->
    <xsl:copy>
        <xsl:apply-templates select="node()"/>
    </xsl:copy>
</xsl:template>

Solution

  • Martin Honnen's suggestion to use unparsed-entity-uri() was the key, as it returned all the information needed. He also tweaked my code to make it more succinct and corrected the regex, which I originally had in analyze-string.

      <!--
        Root template: emits the DOCTYPE declaration through the getDocType
        named template, then copies the document.
      -->
      <xsl:template match="/">
        <xsl:call-template name="getDocType"/>
        <!-- a document node has no attributes, so only its child nodes are processed -->
        <xsl:copy>
          <xsl:apply-templates select="node()"/>
        </xsl:copy>
      </xsl:template>
      
      <!--
        getDocType: writes the DOCTYPE declaration for the document, with an
        internal subset that re-declares
          1. every unparsed entity referenced through @infoEntityIdent, and
          2. one notation per distinct file extension used by those entities.

        Call it with the document node as the context item: the doctype name is
        taken from child::* and the references are collected from descendants.

        The markup is written with disable-output-escaping, so it only takes
        effect when the result tree is serialized.

        Each entity is declared once even when several elements reference it,
        and a reference that does not resolve to a declared unparsed entity is
        reported with xsl:message instead of producing a broken declaration.
      -->
      <xsl:template name="getDocType">
        <!-- all @infoEntityIdent(s); typed as attribute()* so that each item keeps its document as context for unparsed-entity-uri() -->
        <xsl:variable name="infoEntityIdent" as="attribute()*" select="(descendant::symbol | descendant::barCode | descendant::multimedia/multimediaObject | descendant::graphic)/@infoEntityIdent"/>
        <!-- unparsed-entity-uri() returns a zero-length URI when no such entity is declared; keep only the references that resolve -->
        <xsl:variable name="resolved" as="attribute()*" select="$infoEntityIdent[string(unparsed-entity-uri(.)) != '']"/>

        <xsl:for-each select="distinct-values($infoEntityIdent except $resolved)">
          <xsl:message expand-text="1">getDocType: no unparsed entity named '{.}' is declared in the source document; no ENTITY declaration written for it.</xsl:message>
        </xsl:for-each>

        <xsl:text>&#xA;</xsl:text>
        <xsl:text disable-output-escaping="yes">&lt;!DOCTYPE </xsl:text>
        <!-- doctype will either be pm or dmodule -->
        <xsl:value-of select="local-name(child::*)"/>
        <xsl:text> [</xsl:text>
        <xsl:text>&#xA;</xsl:text>

        <!-- write out one entity declaration per distinct entity name, in order of first reference -->
        <xsl:for-each-group select="$resolved" group-by="string(.)">
          <!-- the context item is the first attribute of the group, so the lookup still runs against the source document -->
          <xsl:variable name="uri" as="xs:anyURI" select="unparsed-entity-uri(.)"/>
          <xsl:text disable-output-escaping="yes">&lt;!ENTITY </xsl:text>
          <xsl:value-of select="current-grouping-key()"/>
          <xsl:text> SYSTEM "</xsl:text>
          <!-- remove everything before, and including, the Original directory, leaving any graphics directory -->
          <xsl:value-of select="replace($uri,'^.*Original/(.*)$','$1')"/>
          <xsl:text>" NDATA </xsl:text>
          <!-- the notation name is the file extension: whatever follows the last dot of the URI -->
          <xsl:value-of select="replace($uri, '.*?([^.]+)$', '$1')"/>
          <!-- close the declaration -->
          <xsl:text disable-output-escaping="yes">&gt;</xsl:text>
          <xsl:text>&#xA;</xsl:text>
        </xsl:for-each-group>

        <!-- the distinct file extensions, i.e. the notation names used in the NDATA clauses above -->
        <xsl:variable name="notations" as="xs:string*" select="distinct-values($resolved ! unparsed-entity-uri(.) ! replace(., '.*?([^.]+)$', '$1'))"/>
        <xsl:for-each select="$notations">
          <!-- the tests are anchored so that only the whole extension decides, case-insensitively -->
          <xsl:choose>
            <xsl:when test="matches(., '^jpe?g$', 'i')">
              <xsl:text disable-output-escaping="yes" expand-text="1">&lt;!NOTATION {.} PUBLIC "+//ISBN 0-7923-9432-1::Graphic Notation//NOTATION Joint Photographic Experts Group Raster//EN"&gt;</xsl:text>
            </xsl:when>
            <xsl:when test="matches(., '^cgm$', 'i')">
              <xsl:text disable-output-escaping="yes" expand-text="1">&lt;!NOTATION {.} PUBLIC "-//USA-DOD//NOTATION Computer Graphics Metafile//EN"&gt;</xsl:text>
            </xsl:when>
            <xsl:when test="matches(., '^wrlzip$', 'i')">
              <xsl:text disable-output-escaping="yes" expand-text="1">&lt;!NOTATION {.} SYSTEM "WRLZIP"&gt;</xsl:text>
            </xsl:when>
            <xsl:when test="matches(., '^svg$', 'i')">
              <xsl:text disable-output-escaping="yes" expand-text="1">&lt;!NOTATION {.} SYSTEM "SVG"&gt;</xsl:text>
            </xsl:when>
            <!-- accept both the tif and the tiff spelling of the extension -->
            <xsl:when test="matches(., '^tiff?$', 'i')">
              <xsl:text disable-output-escaping="yes" expand-text="1">&lt;!NOTATION {.} SYSTEM "TIFF"&gt;</xsl:text>
            </xsl:when>
            <xsl:when test="matches(., '^png$', 'i')">
              <xsl:text disable-output-escaping="yes" expand-text="1">&lt;!NOTATION {.} PUBLIC "-//W3C//NOTATION Portable Network Graphics//EN"&gt;</xsl:text>
            </xsl:when>
            <xsl:when test="matches(., '^swf$', 'i')">
              <xsl:text disable-output-escaping="yes" expand-text="1">&lt;!NOTATION {.} PUBLIC "-//S1000D//NOTATION X-SHOCKWAVE-FLASH 3D Models Encoding//EN"&gt;</xsl:text>
            </xsl:when>
            <xsl:otherwise>
              <!-- an entity declaration above names this notation, but no identifier is known for it here -->
              <xsl:message expand-text="1">getDocType: no NOTATION declaration is known for extension '{.}'; entities using it reference an undeclared notation.</xsl:message>
            </xsl:otherwise>
          </xsl:choose>
          <xsl:text>&#xA;</xsl:text>
        </xsl:for-each>
        <xsl:text disable-output-escaping="yes">]&gt;</xsl:text>
        <xsl:text>&#xA;</xsl:text>
        <xsl:text>&#xA;</xsl:text>
      </xsl:template>