Search code examples
java xml sax dtd jdom

Using JDOM to read and write internal DTDs


This is a follow-up to the question "Is there some equivalent in Java to Ruby's Nokogiri::XML::EntityDecl?"

I have a simple DAISY DTBook XML file (although the particular DTD isn't important to my question, this is an actual standard used in older talking books). It contains XML from both the DTBook and MathML namespaces.

Note that the DTD declaration follows the convention that I copied from the specification for MathML in DAISY, where it uses a combined DTD, referring both to an external DTD for the DTBook standard and adding some internal ENTITY definitions for the MathML standard.

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE dtbook PUBLIC "-//NISO//DTD dtbook 2005-2//EN"
 "http://www.daisy.org/z3986/2005/dtbook-2005-2.dtd"
 [
  <!ENTITY % MATHML.prefixed "INCLUDE" >
  <!ENTITY % MATHML.prefix "m">
  <!ENTITY % MATHML.Common.attrib
          "xlink:href    CDATA       #IMPLIED
          xlink:type     CDATA       #IMPLIED
          class          CDATA       #IMPLIED
          style          CDATA       #IMPLIED
          id             ID          #IMPLIED
          xref           IDREF       #IMPLIED
          other          CDATA       #IMPLIED
          xmlns:dtbook   CDATA       #FIXED 'http://www.daisy.org/z3986/2005/dtbook/'
          dtbook:smilref CDATA       #IMPLIED"
  >
  <!ENTITY % mathML2 PUBLIC "-//W3C//DTD MathML 2.0//EN"
             "http://www.w3.org/Math/DTD/mathml2/mathml2.dtd"
  >
  %mathML2;
  <!ENTITY % externalFlow "| m:math">
  <!ENTITY % externalNamespaces "xmlns:m CDATA #FIXED
    'http://www.w3.org/1998/Math/MathML'">
 ]
>
<dtbook xmlns="http://www.daisy.org/z3986/2005/dtbook/" xmlns:m="http://www.w3.org/1998/Math/MathML"
    version="2005-2" xml:lang="eng">
    <head></head>
    <book>
        <frontmatter><doctitle></doctitle></frontmatter>
        <bodymatter>
            <level1>
            <p>Test</p>
                <m:math xmlns:dtbook="http://www.daisy.org/z3986/2005/dtbook/"
                    id="math0001" dtbook:smilref="nativemathml.smil#math0001" altimg="nativemathml0001.png"
                    alttext="sigma-summation UnderScript i equals zero OverScript infinity EndScripts x Subscript i">
                    <m:mrow>
                        <m:mstyle displaystyle='true'>
                            <m:munderover>
                                <m:mo>&#x2211;</m:mo>
                                <m:mrow>
                                    <m:mi>i</m:mi>
                                    <m:mo>=</m:mo>
                                    <m:mn>0</m:mn>
                                </m:mrow>
                                <m:mi>&#x221E;</m:mi>
                            </m:munderover>
                            <m:mrow>
                                <m:msub>
                                    <m:mi>x</m:mi>
                                    <m:mi>i</m:mi>
                                </m:msub>
                            </m:mrow>
                        </m:mstyle>
                    </m:mrow>
                </m:math>
            </level1>
        </bodymatter>
        <rearmatter><level1><p></p></level1></rearmatter>
    </book>
</dtbook>

I used the following Java code to read in the document and print it back out. I first used JDOM 1.1.3 (because of constraints of the larger project this is for) but I did also try it with JDOM 2.0.6.

/**
 * Parses the DAISY/MathML template with validation switched on, then prints the
 * re-serialized document followed by the internal subset reported by its DocType.
 *
 * With this configuration the internal subset prints as empty; the parser setup is
 * intentionally left as-is so the behavior being asked about is still reproduced.
 *
 * @throws IOException   if the template resource cannot be resolved, opened or read
 * @throws JDOMException if the document cannot be parsed
 */
@Test
public void buildDTD2()
        throws IOException, JDOMException
{
    final PathMatchingResourcePatternResolver pmrpr = new PathMatchingResourcePatternResolver();
    final File file = pmrpr.getResource("daisy/mathmldtdtemplate.xml").getFile();
    final String uri = file.toURI().toString();

    final SAXBuilder saxBuilder = new SAXBuilder();
    saxBuilder.setValidation(true);
    saxBuilder.setFeature("http://apache.org/xml/features/validation/schema", true);

    final Document doc;
    // try-with-resources guarantees the file handle is released even when build()
    // throws; a single BufferedInputStream is enough (the stream was wrapped twice).
    try (InputStream stream = new BufferedInputStream(new FileInputStream(file))) {
        final InputSource source = new InputSource(stream);
        // The system id is set explicitly because the parser is handed a raw stream.
        source.setSystemId(uri);
        doc = saxBuilder.build(source);
    }

    String xml2 = new XMLOutputter().outputString(doc);
    System.out.println(xml2);
    System.out.println("Internal Subset: " + doc.getDocType().getInternalSubset());
}

When I use System.out.println to print out getInternalSubset() on the last line, nothing is printed. When I print out the entire document I get this:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE dtbook PUBLIC "-//NISO//DTD dtbook 2005-2//EN" "http://www.daisy.org/z3986/2005/dtbook-2005-2.dtd">
<dtbook xmlns="http://www.daisy.org/z3986/2005/dtbook/" xmlns:m="http://www.w3.org/1998/Math/MathML" version="2005-2" xml:lang="eng">
    <head />
    <book>
        <frontmatter><doctitle /></frontmatter>
        <bodymatter>
            <level1>
            <p>Test</p>
                <m:math xmlns:dtbook="http://www.daisy.org/z3986/2005/dtbook/" id="math0001" dtbook:smilref="nativemathml.smil#math0001" altimg="nativemathml0001.png" alttext="sigma-summation UnderScript i equals zero OverScript infinity EndScripts x Subscript i" overflow="scroll">
                    <m:mrow>
                        <m:mstyle displaystyle="true">
                            <m:munderover>
                                <m:mo>∑</m:mo>
                                <m:mrow>
                                    <m:mi>i</m:mi>
                                    <m:mo>=</m:mo>
                                    <m:mn>0</m:mn>
                                </m:mrow>
                                <m:mi>∞</m:mi>
                            </m:munderover>
                            <m:mrow>
                                <m:msub>
                                    <m:mi>x</m:mi>
                                    <m:mi>i</m:mi>
                                </m:msub>
                            </m:mrow>
                        </m:mstyle>
                    </m:mrow>
                </m:math>
            </level1>
        </bodymatter>
        <rearmatter><level1><p /></level1></rearmatter>
    </book>
</dtbook>

The ENTITY definitions are gone! Have I missed some option that would allow me to maintain them? How can I maintain them? When we process these files we may need to read them in and write them out several times without losing this DTD.


Solution

  • After further research, I found a solution on the jdom-interest list.

    Add the statement saxBuilder.setExpandEntities(false); which, according to Laurent Bihanic, will force the registration of the DeclHandler.

    /**
     * Parses the DAISY/MathML template with validation switched on and entity
     * expansion switched off, then prints the re-serialized document followed by
     * the internal subset reported by its DocType.
     *
     * @throws IOException   if the template resource cannot be resolved, opened or read
     * @throws JDOMException if the document cannot be parsed
     */
    @Test
    public void buildDTD2()
            throws IOException, JDOMException
    {
        final PathMatchingResourcePatternResolver pmrpr = new PathMatchingResourcePatternResolver();
        final File file = pmrpr.getResource("daisy/mathmldtdtemplate.xml").getFile();
        final String uri = file.toURI().toString();

        final SAXBuilder saxBuilder = new SAXBuilder();
        saxBuilder.setValidation(true);
        saxBuilder.setFeature("http://apache.org/xml/features/validation/schema", true);

        // Per the jdom-interest list, disabling entity expansion forces registration of
        // the DeclHandler, which is what makes the internal subset available afterwards.
        saxBuilder.setExpandEntities(false);

        final Document doc;
        // try-with-resources guarantees the file handle is released even when build()
        // throws; a single BufferedInputStream is enough (the stream was wrapped twice).
        try (InputStream stream = new BufferedInputStream(new FileInputStream(file))) {
            final InputSource source = new InputSource(stream);
            // The system id is set explicitly because the parser is handed a raw stream.
            source.setSystemId(uri);
            doc = saxBuilder.build(source);
        }

        String xml2 = new XMLOutputter().outputString(doc);
        System.out.println(xml2);
        System.out.println("Internal Subset: " + doc.getDocType().getInternalSubset());
    }
    

    This works; now the internal subset is read in and printed out after "Internal Subset:".