Search code examples
javaxml

XML API or open source jar for extracting a subset of XML based on a given path


I am wondering if there is an API or an open source jar that can extract a subset of an XML document based on a given path.

For example: I have an XML which is a skeleton (yin model, which is converted from yang model)

<xml .....>
<data>
   <model1>
     <element1>
        <id />
        <name />
        <address />
     </element1>
   </model1>
   <model2>
     <element2>
        <uid />
        <something />
     </element2>
   </model2>
   ....
</data>

a given path:

data/model1/element1[id='1']/name  and  name value is 'John'

and I want the following to be returned

<xml .....>
<data>
   <model1>
     <element1>
        <id>1</id>
        <name>John</name>
     </element1>
   </model1>
</data>

I am not quite sure what keywords to search for. Hopefully, someone who knows XML well enough can give suggestions.

Another question: if there is no existing API or open source library, what would be the best way to handle this? Should I use DOM, since I need the whole (tree) structure from my skeleton? Besides DOM using too much memory, what other drawbacks does it have?


Solution

  • You can use the builtin package javax.xml to read and write data. You can query the XML using XML path language (XPath). For example, extracting the subtree of <element1>:

    /data/model1/element1
    

    Or extracting the subtree of each <element1> whose child element <id> has the text "1":

    /data/model1/element1[id/text() = 1] 
    

    I wrote a small program to demonstrate the usage. You need to

    • create an org.w3c.dom.Document
    • parse the XML content into this object
    • compile your XPath expression
    • evaluate the compiled XPath expression against the document to get the matching nodes as a NodeList
    • export the NodeList or do any other desired tasks.

    You can compile the program and run as follows:

    $ javac Demo.java
    $ java Demo /data/model1/element1
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <data>
      <model1>
        <element1>
    
          <id>1</id>
    
          <name>John</name>
    
          <address>xxx</address>
    
        </element1>
        <element1>
    
          <id>2</id>
    
          <name>Tom</name>
    
          <address>yyy</address>
    
        </element1>
      </model1>
    </data>
    
    ~ $ java Demo '/data/model1/element1[id/text() = 1]'
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <data>
      <model1>
        <element1>
    
          <id>1</id>
    
          <name>John</name>
    
          <address>xxx</address>
    
        </element1>
      </model1>
    </data>
    

    The full program:

    import java.io.*;
    import java.nio.charset.StandardCharsets;
    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.IdentityHashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.Set;
    import javax.xml.parsers.DocumentBuilderFactory;
    import javax.xml.transform.*;
    import javax.xml.transform.dom.DOMSource;
    import javax.xml.transform.stream.StreamResult;
    import javax.xml.xpath.*;
    import org.w3c.dom.*;
    
    /**
     * Command-line demo that extracts a subset of an XML document with an XPath expression.
     *
     * <p>The expression given as the first program argument is evaluated against the embedded
     * sample document {@link #XML}. Every matching node is copied into a fresh document
     * <em>together with the chain of ancestor elements it was found under</em>, so the result keeps
     * the shape of the source tree (for example {@code /data/model2/element2} is emitted below
     * {@code <data><model2>}). The result is pretty-printed to standard output.
     */
    public class Demo {

      /** Sample document the XPath expression is evaluated against. */
      private static final String XML =
          "<?xml version=\"1.0\"?>\n"
              + "<data>\n"
              + "  <model1>\n"
              + "    <element1>\n"
              + "      <id>1</id>\n"
              + "      <name>John</name>\n"
              + "      <address>xxx</address>\n"
              + "    </element1>\n"
              + "    <element1>\n"
              + "      <id>2</id>\n"
              + "      <name>Tom</name>\n"
              + "      <address>yyy</address>\n"
              + "    </element1>\n"
              + "  </model1>\n"
              + "  <model2>\n"
              + "    <element2>\n"
              + "      <uid />\n"
              + "      <something />\n"
              + "    </element2>\n"
              + "  </model2>"
              + "</data>";

      /**
       * Evaluates {@code args[0]} as an XPath node-set expression against {@link #XML} and prints
       * the extracted sub-document to standard output.
       *
       * <p>When nothing matches, nothing is written to standard output and a short notice goes to
       * standard error instead.
       *
       * @param args command-line arguments; {@code args[0]} is the XPath expression, any further
       *     arguments are ignored
       * @throws IllegalArgumentException if no expression was supplied or a matched node is not part
       *     of the document tree
       * @throws Exception if parsing, XPath compilation/evaluation, or serialization fails
       */
      public static void main(String[] args) throws Exception {
        if (args.length == 0) {
          throw new IllegalArgumentException("Usage: java Demo <xpath-expression>");
        }

        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        Document source;
        try (InputStream in = new ByteArrayInputStream(XML.getBytes(StandardCharsets.UTF_8))) {
          source = factory.newDocumentBuilder().parse(in);
        }

        // Extract
        XPathExpression expr = XPathFactory.newInstance().newXPath().compile(args[0]);
        NodeList nodeList = (NodeList) expr.evaluate(source, XPathConstants.NODESET);

        // Export
        Document target = factory.newDocumentBuilder().newDocument();
        copyMatches(source, nodeList, target);
        if (!target.hasChildNodes()) {
          System.err.println("No nodes matched: " + args[0]);
          return;
        }
        // Indentation whitespace copied from the source would otherwise be re-emitted next to the
        // serializer's own indentation, producing blank lines in the output.
        stripBlankText(target);
        System.out.println(getStringFrom(target));
      }

      /**
       * Copies every matched node into {@code target}, recreating the ancestor elements it was
       * found under.
       *
       * <p>A match that lies inside another match is skipped because the outer match's deep copy
       * already contains it. A match on the document node is treated as a match on its root
       * element. An attribute match is represented by the (attribute-carrying) copy of its owner
       * element.
       *
       * @param source document the matches belong to
       * @param matches nodes selected from {@code source}
       * @param target empty document that receives the copies
       * @throws IllegalArgumentException if a matched node is not attached to {@code source}
       */
      private static void copyMatches(Document source, NodeList matches, Document target) {
        List<Node> ordered = new ArrayList<Node>();
        Set<Node> selected = Collections.newSetFromMap(new IdentityHashMap<Node, Boolean>());
        for (int i = 0; i < matches.getLength(); i++) {
          Node node = matches.item(i);
          if (node.getNodeType() == Node.DOCUMENT_NODE) {
            // A Document cannot be imported into another Document; its root element stands in.
            node = ((Document) node).getDocumentElement();
          }
          if (node != null && selected.add(node)) {
            ordered.add(node);
          }
        }

        // Source node -> its counterpart in the target. Seeding the map with the two documents
        // terminates the ancestor walk in mirror().
        Map<Node, Node> mirrored = new IdentityHashMap<Node, Node>();
        mirrored.put(source, target);

        for (Node node : ordered) {
          if (hasSelectedAncestor(node, selected)) {
            continue;
          }
          Node sourceParent = parentOf(node);
          if (sourceParent == null) {
            throw new IllegalArgumentException(
                "Matched node is not part of the document tree: " + node.getNodeName());
          }
          Node targetParent = mirror(sourceParent, target, mirrored);
          if (node.getNodeType() != Node.ATTRIBUTE_NODE) {
            targetParent.appendChild(target.importNode(node, true));
          }
          // For an attribute nothing more is needed: the shallow copy of its owner element made by
          // mirror() already carries the element's attributes.
        }
      }

      /**
       * Returns the target-side counterpart of {@code sourceNode}, creating it (and any missing
       * ancestors) as a child-less copy on first use.
       */
      private static Node mirror(Node sourceNode, Document target, Map<Node, Node> mirrored) {
        Node existing = mirrored.get(sourceNode);
        if (existing != null) {
          return existing;
        }
        Node sourceParent = sourceNode.getParentNode();
        if (sourceParent == null) {
          throw new IllegalArgumentException(
              "Node is not part of the document tree: " + sourceNode.getNodeName());
        }
        // Shallow import: keeps the element's name and attributes but none of its children.
        Node shell = target.importNode(sourceNode, false);
        mirror(sourceParent, target, mirrored).appendChild(shell);
        mirrored.put(sourceNode, shell);
        return shell;
      }

      /** Returns the node that structurally contains {@code node}; for attributes, the owner element. */
      private static Node parentOf(Node node) {
        if (node.getNodeType() == Node.ATTRIBUTE_NODE) {
          return ((Attr) node).getOwnerElement();
        }
        return node.getParentNode();
      }

      /** Tells whether any ancestor of {@code node} is itself one of the selected nodes. */
      private static boolean hasSelectedAncestor(Node node, Set<Node> selected) {
        for (Node ancestor = parentOf(node); ancestor != null; ancestor = ancestor.getParentNode()) {
          if (selected.contains(ancestor)) {
            return true;
          }
        }
        return false;
      }

      /**
       * Recursively removes text nodes that consist only of whitespace. CDATA sections and text
       * with visible content are left untouched.
       */
      private static void stripBlankText(Node node) {
        Node child = node.getFirstChild();
        while (child != null) {
          // Fetch the sibling first: removing the child detaches it from the sibling chain.
          Node next = child.getNextSibling();
          if (child.getNodeType() == Node.TEXT_NODE && child.getNodeValue().trim().isEmpty()) {
            node.removeChild(child);
          } else {
            stripBlankText(child);
          }
          child = next;
        }
      }

      /**
       * Serializes {@code doc} to an indented XML string.
       *
       * @param doc document to serialize
       * @return the XML text, indented by two spaces per level where the transformer supports the
       *     Apache {@code indent-amount} property
       * @throws TransformerException if the transformer cannot be created or the transformation fails
       */
      private static String getStringFrom(Document doc) throws TransformerException {
        DOMSource domSource = new DOMSource(doc);
        StringWriter writer = new StringWriter();
        StreamResult result = new StreamResult(writer);
        TransformerFactory tf = TransformerFactory.newInstance();
        Transformer transformer = tf.newTransformer();
        // set indent
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
        transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2");
        transformer.transform(domSource, result);
        return writer.toString();
      }
    }