Search code examples
xml xpath vtd-xml

VTD-XML - Not able to get text after span tag


<?xml version="1.0"?>
<catalog>
    <book id="bk001" type='fiction'>
        <author>Gambardella, Matthew</author>
        <author>Doe, John</author>
        <title>XML IN-DEPT Developer's Guide</title>
        <genre>Computer</genre>
        <price>44.95</price>
        <snippet>
            <inlineXML contenttype="application/xhtml+xml" >
                <html lang="en-US" >
                    <head>
                        <title>XML IN-DEPT Developer's Guide</title>
                    </head>
                    <body>
                        <p>This is an example book for developers want to gain knowledge on  <span class="boldcls" type="xml" >XML</span> Marshalling and UnMarshalling. Need to know all about <span class="boldcls" type="tech" >XML parsing and editing</span>, Grab this Book!</p>
                    </body>
                </html>
            </inlineXML>
        </snippet>
    </book>
</catalog>

Above is the sample XML. I want to evaluate the XPath expression "/catalog/book/snippet", iterate over all matching elements and their descendants, and get their text. I am using modified code from this answer (https://stackoverflow.com/a/21279523/1297935), shown in the UPDATE below, with the VTD-XML library. The problem is that it does not return the text that follows a span tag. The output I currently get for the paragraph tag is:

    Level [6] Tag [p]   
            This is an example book for developers want to gain knowledge on
    Level [7] Tag [span] @class=boldcls
            XML
    Level [8] Tag [span] @class=boldcls
            XML parsing and editing

Which is wrong, as it should be:

    Level [6] Tag [p]   
            This is an example book for developers want to gain knowledge on XML Marshalling and UnMarshalling. Need to know all about XML parsing and editing, Grab this Book!
    Level [7] Tag [span] @class=boldcls
            XML
    Level [8] Tag [span] @class=boldcls
            XML parsing and editing

UPDATE: I have modified the example code a bit:

package com.vtd.test;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.w3c.dom.Document;

import com.ximpleware.AutoPilot;
import com.ximpleware.NavException;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import com.ximpleware.XPathEvalException;
import com.ximpleware.XPathParseException;

/**
 * Collects attribute values and element text from an XML document using the VTD-XML
 * navigation API ({@code VTDNav} / {@code AutoPilot}).
 *
 * <p>An XPath expression is selected once in the constructor; {@link #readXML()} then visits
 * every match and, depth-first, each of its descendant elements, appending the values it finds
 * to a flat list.
 */
public class VTDXMLReader {

    // private String xpathExpression;

    /** Navigation cursor over the parsed document; obtained from the VTDGen in the constructor. */
    private VTDNav vtdNav;

    /** XPath evaluator bound to {@link #vtdNav} with the caller's expression selected. */
    private AutoPilot autoPilot;

    /** When true, every attribute value of each visited element is added to the result. */
    private boolean includeAttributes;

    /**
     * Attribute name taken from the last XPath segment when that segment starts with "@";
     * stays {@code null} otherwise.
     */
    private String attribute;

    /**
     * Serializes the DOM document to bytes, parses those bytes with VTD-XML and selects the
     * given XPath expression.
     *
     * <p>Failures while serializing, parsing or selecting the XPath are only printed; the
     * constructor does not rethrow them.
     *
     * @param storyDoc          DOM document to read
     * @param includeAttributes whether all attribute values of visited elements are collected
     * @param xpathExpression   XPath expression whose matches {@link #readXML()} iterates over
     */
    public VTDXMLReader(final Document storyDoc, final boolean includeAttributes, final String xpathExpression) {
        this.includeAttributes = includeAttributes;
        // this.xpathExpression = xpathExpression;
        final VTDGen vtdGen = new VTDGen();
        try {
            // VTDGen consumes a byte array, so the DOM tree is first written out through an
            // identity Transformer into an in-memory buffer.
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            TransformerFactory transformerFactory = TransformerFactory.newInstance();
            Transformer transformer = transformerFactory.newTransformer();
            DOMSource source = new DOMSource(storyDoc);
            StreamResult result = new StreamResult(baos);
            transformer.transform(source, result);
            byte[] array = baos.toByteArray();

            vtdGen.setDoc(array);
            // NOTE(review): the boolean is presumably VTD-XML's namespace-awareness switch — confirm.
            vtdGen.parse(true);
        } catch (Exception ex) {
            // Best effort: the error is printed and construction continues with whatever
            // getNav() returns below.
            ex.printStackTrace();
        }
        vtdNav = vtdGen.getNav();
        autoPilot = new AutoPilot(vtdNav);
        // If the last "/"-separated segment starts with "@", remember it (with every "@"
        // removed) as the attribute whose value is read for each match in readXML().
        String[] xpathFrags = xpathExpression.split("/");
        if (xpathFrags[xpathFrags.length - 1].startsWith("@")) {
            attribute = xpathFrags[xpathFrags.length - 1].replaceAll("@", "");
        }
        try {
            autoPilot.selectXPath(xpathExpression);
        } catch (XPathParseException e) {
            e.printStackTrace();
        }
    }

    /**
     * Evaluates the selected XPath and collects values for every match.
     *
     * <p>For each match, in this order: all attribute values (only when
     * {@code includeAttributes} is set), the value of the attribute named by the XPath's
     * trailing "@" segment (if any), the element's text as reported by {@code getText()}, and
     * then the same data for all descendant elements via
     * {@link #navigateToChildren(VTDNav, boolean, List)}.
     *
     * <p>Exceptions raised during evaluation are printed and the values gathered so far are
     * returned.
     *
     * @return the collected values in the order they were encountered
     * @throws IOException declared in the signature; no statement in this method visibly throws it
     */
    public List<String> readXML() throws IOException {
        List<String> values = new ArrayList<String>();
        try {
            // evalXPath() is called until it returns -1, i.e. until there are no more matches.
            while (autoPilot.evalXPath() != -1) {
                // printTag(vn);
                if (includeAttributes) {
                    Map<String, String> amap = new LinkedHashMap<String, String>();

                    loadAttributeMap(vtdNav, amap);

                    // Only the values are kept; the names are used for iteration order only.
                    for (String aname : amap.keySet()) {
                        String aval = amap.get(aname);
                        values.add(aval);
                        // System.out.print(" @" + aname + "=" + aval);
                    }
                    // System.out.print("\n");
                }
                int val = 0;
                if (attribute != null && !attribute.isEmpty()) {
                    // -1 is treated as "attribute not present on this element".
                    val = vtdNav.getAttrVal(attribute);
                    if (val != -1) {
                        String id = vtdNav.toNormalizedString(val);
                        values.add(id);
                        // System.out.println("Attribute: " + id);
                    }
                }
                // getText() is called once per element, so at most one text value is added
                // here; -1 is treated as "no text".
                val = vtdNav.getText();
                if (val != -1) {
                    String author = vtdNav.toNormalizedString(val);
                    values.add(author);
                    // System.out.println("\t" + author);
                }
                navigateToChildren(vtdNav, includeAttributes, values);

            }
            // autoPilot.resetXPath();
        } catch (Exception ex) {
            ex.printStackTrace();
        }

        return values;

    }

    /**
     * Recursively visits the child elements of the element the cursor currently points at and
     * appends their attribute values (optional) and text to {@code values}.
     *
     * <p>The cursor position is bracketed with {@code push()} / {@code pop()}. If an exception
     * occurs it is printed and the method returns without reaching {@code pop()}.
     *
     * @param vn                navigation cursor positioned on the parent element
     * @param includeAttributes whether all attribute values of visited elements are collected
     * @param values            output list that receives the values in visiting order
     */
    public static void navigateToChildren(final VTDNav vn, final boolean includeAttributes, List<String> values) {
        try {
            vn.push();
            if (vn.toElement(VTDNav.FIRST_CHILD)) {
                // Visit the first child and then each following sibling.
                do {
                    // printTag(vn);

                    if (includeAttributes) {
                        Map<String, String> amap = new LinkedHashMap<String, String>();

                        loadAttributeMap(vn, amap);

                        for (String aname : amap.keySet()) {
                            String aval = amap.get(aname);
                            values.add(aval);
                            // System.out.print(" @" + aname + "=" + aval);
                        }
                        // System.out.print("\n");
                    }

                    // One getText() call per element: a single text value is added, so for
                    // mixed content (text interleaved with child elements) later text runs of
                    // the same element are not collected by this code.
                    int val = vn.getText();
                    if (val != -1) {
                        String author = vn.toNormalizedString(val);
                        values.add(author);
                        // System.out.println("\t" + author);
                    }
                    // Descend before moving on to the next sibling (depth-first order).
                    navigateToChildren(vn, includeAttributes, values);
                } while (vn.toElement(VTDNav.NEXT_SIBLING));
            }
            // Step back up, then pop the position pushed at the top of this method.
            vn.toElement(VTDNav.PARENT);
            vn.pop();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Fills {@code amap} with the attributes of the element the cursor currently points at,
     * in the order the "@*" XPath reports them.
     *
     * @param nav  navigation cursor positioned on the element of interest
     * @param amap output map from attribute name to attribute value
     */
    private static void loadAttributeMap(VTDNav nav, Map<String, String> amap) {

        // push()/pop() bracket the attribute scan around the caller's cursor position.
        nav.push();

        try {
            // A separate AutoPilot is used so the caller's XPath iteration is not disturbed.
            AutoPilot apAtt = new AutoPilot(nav);
            apAtt.selectXPath("@*");

            int j = -1;
            while ((j = apAtt.evalXPath()) != -1) {
                // The name is read from token j and the value from the token right after it.
                String name = nav.toString(j);
                String val = nav.toString(j + 1);

                amap.put(name, val);
            }
        } catch (XPathParseException | XPathEvalException | NavException e) {
            e.printStackTrace();
        }

        nav.pop();
    }

    /**
     * Demo entry point: parses {@code books.xml} from the working directory into a DOM
     * document, reads "/catalog/book/snippet" without attributes and prints one value per line.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
            Document document = dBuilder.parse(new File("books.xml"));

            VTDXMLReader vtdxmlReader = new VTDXMLReader(document, false, "/catalog/book/snippet");
            List<String> xmlFrags = vtdxmlReader.readXML();
            for (String xmlFrag : xmlFrags) {
                System.out.println(xmlFrag);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

Output of the above code is:

XML IN-DEPT Developer's Guide
This is an example book for developers want to gain knowledge on
XML
XML parsing and editing

Which should have been:

XML IN-DEPT Developer's Guide
This is an example book for developers want to gain knowledge on
XML
Marshalling and UnMarshalling. Need to know all about
XML parsing and editing
, Grab this Book!

Any ideas?

What I want to do: given the following paragraph tag in an HTML document:

<p>This is an example book for developers want to gain knowledge on  <span class="boldcls" type="xml" >XML</span> Marshalling and UnMarshalling. Need to know all about <span class="boldcls" type="tech" >XML parsing and editing</span>, Grab this Book!</p>

I want to write a reader that reads it from left to right, including the attribute values, and emits it line by line like this:

==> This is an example book for developers want to gain knowledge on
==> boldcls xml XML
==> Marshalling and UnMarshalling. Need to know all about
==> boldcls tech XML parsing and editing
==> , Grab this Book!

Currently I am doing this using XMLEventReader, which I want to replace with VTD-XML library code.


Solution

  • I made a slight modification to your navigateToChildren subroutine: it now calls VTDNav's getXPathStringVal() to get all the text nodes. The underlying problem is getText(), which works fine for data-centric XML documents. For document-centric use cases (mixed content), you should call getXPathStringVal() to extract the text directly. This method is available in newer versions of VTD-XML. Is this what you are looking for?

    /**
     * Recursively visits the child elements of the element {@code vn} currently points at and
     * appends their attribute values (optional) and text to {@code values}, echoing each value
     * to standard output.
     *
     * <p>Text is read with {@code getXPathStringVal()}, i.e. the XPath string value of the
     * element, so text that follows a nested element (mixed content) is included. Because the
     * children are visited as well, their text is added again under their own entry.
     *
     * <p>The cursor position saved on entry is always restored, even when navigation fails;
     * failures are printed and not rethrown.
     *
     * @param vn                navigation cursor positioned on the parent element
     * @param includeAttributes whether all attribute values of visited elements are collected
     * @param values            output list that receives the values in visiting order
     */
    public static void navigateToChildren(final VTDNav vn, final boolean includeAttributes, List<String> values) {
        // Tracks whether push() succeeded so that pop() is only called to balance it.
        boolean pushed = false;
        try {
            vn.push();
            pushed = true;
            if (vn.toElement(VTDNav.FIRST_CHILD)) {
                do {
                    if (includeAttributes) {
                        Map<String, String> amap = new LinkedHashMap<String, String>();
                        loadAttributeMap(vn, amap);

                        for (Map.Entry<String, String> attr : amap.entrySet()) {
                            values.add(attr.getValue());
                            System.out.print(" ==>@" + attr.getKey() + "=" + attr.getValue());
                        }
                    }

                    // getText() serves only as a probe for "this element has text of its
                    // own"; the value itself comes from getXPathStringVal().
                    if (vn.getText() != -1) {
                        String text = vn.getXPathStringVal();
                        values.add(text);
                        System.out.println("==>\t" + text);
                    }
                    // Descend before moving on to the next sibling (depth-first order).
                    navigateToChildren(vn, includeAttributes, values);
                } while (vn.toElement(VTDNav.NEXT_SIBLING));
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Restoring in finally keeps the push/pop stack balanced and puts the cursor
            // back on the parent even if a navigation call above threw. pop() restores the
            // saved position, so no explicit move to PARENT is needed first.
            if (pushed) {
                vn.pop();
            }
        }
    }
    

    Second edit: I wrote a small app that concatenates all the subordinate text and attribute values. It accesses the underlying VTD buffer directly by index and scans through the VTD records sequentially, starting right after the p element's own token. If a token's type is attribute value or character data, the app appends it to the string buffer.

    import com.ximpleware.*;
    
    /**
     * Demo: concatenates, in document order, every attribute value and character-data token
     * that belongs to the first {@code p} element matched by the XPath below.
     *
     * <p>The VTD records are scanned sequentially by index, starting right after the token of
     * the matched element and stopping at the first token that lies outside its subtree.
     */
    public class collectTokens {

        /** File parsed when no path is given on the command line. */
        private static final String DEFAULT_FILE = "d:\\xml\\books.xml";

        /**
         * @param s optional; {@code s[0]} is the XML file to parse, defaulting to
         *          {@link #DEFAULT_FILE}
         * @throws VTDException if the XPath cannot be parsed/evaluated or navigation fails
         */
        public static void main(String[] s) throws VTDException {
            String file = (s != null && s.length > 0) ? s[0] : DEFAULT_FILE;
            VTDGen vg = new VTDGen();
            if (!vg.parseFile(file, true)) {
                System.err.println("Could not parse " + file);
                return;
            }
            VTDNav vn = vg.getNav();
            AutoPilot ap = new AutoPilot(vn);
            ap.selectXPath("/catalog/book/snippet/inlineXML/html/body/p");
            if (ap.evalXPath() == -1) {
                // No p element matched; nothing to collect.
                return;
            }

            int start = vn.getCurrentIndex();   // token index of the matched p element
            int depth = vn.getTokenDepth(start);
            int count = vn.getTokenCount();

            // Collect the text of all character-data and attribute-value tokens sequentially.
            StringBuilder sb = new StringBuilder(50);
            for (int index = start + 1; index < count; index++) {
                int type = vn.getTokenType(index);
                int tokenDepth = vn.getTokenDepth(index);

                // A start tag at the same or a shallower depth is a following sibling or
                // the start of another branch: the p subtree has ended. The token TYPE must
                // be tested here, not the depth a second time.
                if (type == VTDNav.TOKEN_STARTING_TAG && tokenDepth <= depth) {
                    break;
                }
                if (type == VTDNav.TOKEN_CHARACTER_DATA) {
                    // Text at a shallower depth belongs to an ancestor, i.e. it follows
                    // the end of p.
                    if (tokenDepth < depth) {
                        break;
                    }
                    sb.append(vn.toString(index)).append(' ');
                } else if (type == VTDNav.TOKEN_ATTR_VAL) {
                    sb.append(vn.toString(index)).append(' ');
                }
            }
            System.out.println(sb);
        }
    }