Search code examples
python lxml mathml

How to read MathML with entity references using lxml in Python


I am trying to read a MathML file using lxml:

import lxml.etree as et

#function that reads the xhtml file and returns the root of the document
def getData(fname):
    """Read the file *fname* and return the root element of the parsed XML.

    Newlines are stripped from the text before it is handed to
    ``et.fromstring``, so the parser sees the whole document as one line.

    Raises ``lxml.etree.XMLSyntaxError`` when the default parser rejects
    the document (for example on an entity it has no definition for).
    """
    # Use a context manager so the file handle is closed even if read() fails.
    with open(fname) as f:
        d = f.read()
    data = d.replace('\n', '')
    root = et.fromstring(data)
    return root

def main():
    """Parse ``integral_example.xhtml`` and print its serialized XML."""
    r = getData('integral_example.xhtml')
    # tostring() returns an ASCII-encoded byte string by default; decode it
    # so print() emits plain text rather than a b'...' repr on Python 3.
    # The parenthesised call form is valid on both Python 2 and Python 3.
    print(et.tostring(r).decode('ascii'))

if __name__ == '__main__':
    main()

integral_example.xhtml:

<!-- begin MathToWeb -->
<!-- (your LaTeX) $ \int_a^b f(x)\,dx. $ -->
<!DOCTYPE math SYSTEM "http://www.w3.org/Math/DTD/mathml1/mathml.dtd">
<math>
<mrow> 
    <mstyle displaystyle="true">
        <munderover>
            <mo>&int;</mo>
            <mi>a</mi>
            <mi>b</mi>
        </munderover>
    </mstyle>
    <mi>f</mi>
    <mrow>
        <mo maxsize="1.00em" form="prefix">(</mo>
        <mi>x</mi>
        <mo maxsize="1.00em" form="postfix">)</mo>
    </mrow>
    <mspace width="0.167em" />
    <mo>&dd;</mo>
    <mi>x</mi>
    <mo>.</mo>
</mrow>
</math>
<!-- end MathToWeb -->

so_question.py is the Python code given above. The output when I run it is given below:

Venkateshs-MacBook-Pro:mathml_examples venkatesh$ python so_question.py

Traceback (most recent call last):
  File "so_question.py", line 14, in <module>
    main()
  File "so_question.py", line 11, in main
    r = getData('integral_example.xhtml')
  File "so_question.py", line 7, in getData
    root = et.fromstring(data)
  File "lxml.etree.pyx", line 3003, in lxml.etree.fromstring (src/lxml/lxml.etree.c:67277)
  File "parser.pxi", line 1785, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:101626)
  File "parser.pxi", line 1673, in lxml.etree._parseDoc (src/lxml/lxml.etree.c:100455)
  File "parser.pxi", line 1074, in lxml.etree._BaseParser._parseDoc (src/lxml/lxml.etree.c:95637)
  File "parser.pxi", line 582, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:90446)
  File "parser.pxi", line 683, in lxml.etree._handleParseResult (src/lxml/lxml.etree.c:91632)
  File "parser.pxi", line 622, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:90928)
lxml.etree.XMLSyntaxError: Entity 'int' not defined, line 1, column 197

Where am I going wrong?


Solution

  • To load the entities from the DTD, use the load_dtd=True option when creating the parser. Also set no_network=False to allow network access when looking up external documents (in this case, http://www.w3.org/Math/DTD/mathml1/mathml.dtd).

    import lxml.etree as ET
    
    # function that reads the xhtml file and returns the root of the document
    
    
    def getData(fname):
        """Parse the XML file *fname* and return its root element.

        The parser is created with ``load_dtd=True`` so entities declared in
        the document's DTD are loaded, and ``no_network=False`` so the DTD
        may be fetched over the network when it is an external document.
        """
        dtd_parser = ET.XMLParser(load_dtd=True, no_network=False)
        with open(fname) as source:
            tree = ET.parse(source, parser=dtd_parser)
        return tree.getroot()
    
    
    def main():
        """Parse ``integral_example.xhtml`` and print its serialized XML."""
        r = getData('integral_example.xhtml')
        # tostring() returns an ASCII-encoded byte string by default; decode
        # it so print() emits plain text rather than a b'...' repr on
        # Python 3. The parenthesised call form is valid on both Python 2
        # and Python 3.
        print(ET.tostring(r).decode('ascii'))

    if __name__ == '__main__':
        main()
    

    yields

    <!DOCTYPE math SYSTEM "http://www.w3.org/Math/DTD/mathml1/mathml.dtd">
    <math>
    <mrow> 
        <mstyle displaystyle="true">
            <munderover>
                <mo>&#8747;</mo>
                <mi>a</mi>
                <mi>b</mi>
            </munderover>
        </mstyle>
        <mi>f</mi>
        <mrow>
            <mo maxsize="1.00em" form="prefix">(</mo>
            <mi>x</mi>
            <mo maxsize="1.00em" form="postfix">)</mo>
        </mrow>
        <mspace width="0.167em"/>
        <mo>&#63308;</mo>
        <mi>x</mi>
        <mo>.</mo>
    </mrow>
    </math><!-- end MathToWeb -->
    

    If you don't want to resolve the entities, then you could use

    parser = ET.XMLParser(resolve_entities=False)
    

    instead; it's quicker, but

    <mo>&int;</mo>
    

    is not changed to

    <mo>&#8747;</mo>