Search code examples
javaxmlxpathxml-namespaces

Parsing XML with multiple namespaces with xPath in Java


I am trying to parse a XML document that has two xmlns namespaces and all my xPath queries are returning null.

I want to parse the values of category nodes and create an array but because the document has two namespaces, no matter what xpath expression I use, it always returns null.

If I remove one namespace then it works fine. I have looked up other answers but couldn't find something that works so posting this as a new question.

Here's what I have tried so far. I am using this article as a reference.

Thanks for your help in advance.

import java.io.FileInputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import javax.xml.XMLConstants;
import javax.xml.namespace.NamespaceContext;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;

import org.xml.sax.InputSource;

import org.w3c.dom.Document;
import org.w3c.dom.NodeList;


class Main
{
    public static void main(String[] args) throws Exception
    {

        //Parse XML file
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(true);
        DocumentBuilder builder = factory.newDocumentBuilder();
        Document doc = builder.parse(new InputSource(new StringReader("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
                "<newsItem guid=\"urn:newsml:news.com.au:20210401\" version=\"1\"\n" +
                "  standard=\"NewsML-G2\" standardversion=\"2.9\"\n" +
                "  xmlns=\"http://iptc.org/std/nar/2006-10-01/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">\n" +
                "  <catalogRef href=\"http://www.iptc.org/std/catalog/catalog.IPTC-G2-Standards_16.xml\"/>\n" +
                "  <itemMeta>\n" +
                "    <itemClass qcode=\"ninat:video\"/>\n" +
                "    <provider>\n" +
                "      <name>FoxSports</name>\n" +
                "    </provider>\n" +
                "    <versionCreated>2021-04-01T16:10:15.736+11:00</versionCreated>\n" +
                "    <event>create</event>\n" +
                "  </itemMeta>\n" +
                "  <contentMeta>\n" +
                "    <FWID>0</FWID>\n" +
                "    <originalId>799186</originalId>\n" +
                "    <contentCreated>2021-04-01T16:10:15.736+11:00</contentCreated>\n" +
                "    <expiration>2021-05-01T15:00:43.057+10:00</expiration>\n" +
                "    <slugline>Test - Video Name</slugline>\n" +
                "    <headline>Test - video headline</headline>\n" +
                "    <description>Test AFL: David King breaks down his new theory surrounding Dimma and the Tigers. </description>\n" +
                "    <category>\n" +
                "      <id>208</id>\n" +
                "      <name>AFL</name>\n" +
                "      <category>\n" +
                "        <id>320</id>\n" +
                "        <name>AFL 360</name>\n" +
                "      </category>\n" +
                "    </category>\n" +
                "    <collections>\n" +
                "      <collection>\n" +
                "        <id>138</id>\n" +
                "        <name>alexa</name>\n" +
                "      </collection>\n" +
                "    </collections>\n" +
                "    <isPremiumPay>false</isPremiumPay>\n" +
                "    <geoblock>false</geoblock>\n" +
                "  </contentMeta>\n" +
                "  <contentSet>\n" +
                "    <remoteContent id=\"web\"\n" +
                "      href=\"DIMMA'S_GAMBIT_2021_01_04_04_55_09.jpg\" version=\"1\"\n" +
                "      rendition=\"rnd:web\" size=\"44848\" contenttype=\"image/jpeg\"\n" +
                "      width=\"640\" height=\"360\" colourspace=\"colsp:sRGB\" orientation=\"1\" resolution=\"96\"/>\n" +
                "    <remoteContent href=\"DIMMA'S_GAMBIT_2021_01_04_04_55_09_564.mp4\"\n" +
                "      contenttype=\"video/mp4\" width=\"512\" height=\"288\" duration=\"121\"\n" +
                "      audiobitrate=\"64000\" videoavgbitrate=\"500000\" videoaspectratio=\"16:9\"/>\n" +
                "    <remoteContent href=\"DIMMA'S_GAMBIT_2021_01_04_04_55_09_248.mp4\"\n" +
                "      contenttype=\"video/mp4\" width=\"512\" height=\"288\" duration=\"121\"\n" +
                "      audiobitrate=\"48000\" videoavgbitrate=\"200000\" videoaspectratio=\"16:9\"/>\n" +
                "    <remoteContent href=\"DIMMA'S_GAMBIT_2021_01_04_04_55_09_1596.mp4\"\n" +
                "      contenttype=\"video/mp4\" width=\"800\" height=\"450\" duration=\"121\"\n" +
                "      audiobitrate=\"96000\" videoavgbitrate=\"1500000\" videoaspectratio=\"16:9\"/>\n" +
                "    <remoteContent href=\"DIMMA'S_GAMBIT_2021_01_04_04_55_09_2628.mp4\"\n" +
                "      contenttype=\"video/mp4\" width=\"1280\" height=\"720\" duration=\"121\"\n" +
                "      audiobitrate=\"128000\" videoavgbitrate=\"2500000\" videoaspectratio=\"16:9\"/>\n" +
                "    <remoteContent href=\"DIMMA'S_GAMBIT_2021_01_04_04_55_09_1096.mp4\"\n" +
                "      contenttype=\"video/mp4\" width=\"640\" height=\"360\" duration=\"121\"\n" +
                "      audiobitrate=\"96000\" videoavgbitrate=\"1000000\" videoaspectratio=\"16:9\"/>\n" +
                "    <remoteContent href=\"DIMMA'S_GAMBIT_2021_01_04_04_55_09_896.mp4\"\n" +
                "      contenttype=\"video/mp4\" width=\"640\" height=\"360\" duration=\"121\"\n" +
                "      audiobitrate=\"96000\" videoavgbitrate=\"800000\" videoaspectratio=\"16:9\"/>\n" +
                "  </contentSet>\n" +
                "</newsItem>\n")));

        //Get XPath expression
        XPathFactory xpathfactory = XPathFactory.newInstance();
        XPath xpath = xpathfactory.newXPath();
        xpath.setNamespaceContext(new NamespaceResolver(doc));
        XPathExpression expr = xpath.compile("/newsItem/itemMeta");

        //Search XPath expression
        Object result = expr.evaluate(doc, XPathConstants.NODESET);

        //Iterate over results and fetch book names
        NodeList nodes = (NodeList) result;
        for (int i = 0; i < nodes.getLength(); i++) {
            System.out.println(nodes.item(i).getNodeValue());
        }
        
    }
}
class NamespaceResolver implements NamespaceContext
{
    //Store the source document to search the namespaces
    private Document sourceDocument;

    public NamespaceResolver(Document document) {
        sourceDocument = document;
    }

    //The lookup for the namespace uris is delegated to the stored document.
    public String getNamespaceURI(String prefix) {
        if (prefix.equals(XMLConstants.DEFAULT_NS_PREFIX)) {
            return sourceDocument.lookupNamespaceURI(null);
        }  else {
            return sourceDocument.lookupNamespaceURI(prefix);
        }
    }

    public String getPrefix(String namespaceURI) {
        return sourceDocument.lookupPrefix(namespaceURI);
    }

    @SuppressWarnings("rawtypes")
    public Iterator getPrefixes(String namespaceURI) {
        return null;
    }
}```


Solution

  • Your XML elements are bound to the namespace http://iptc.org/std/nar/2006-10-01/, but your XPath is not using any namespace-prefixes, so /newsItem/itemMeta is asking for elements that are bound to no namespace.

    You could address them by just the local-name():

    /*[local-name()='newsItem']/*[local-name()='itemMeta']
    

    Otherwise, you need to register the namespace with a namespace prefix, or use a custom NamespaceContext to resolve the namespace from your chosen namespace-prefix:

    xpath.setNamespaceContext(new NamespaceContext() {
        public String getNamespaceURI(String prefix) {
          switch (prefix) {
            case "i": return "http://iptc.org/std/nar/2006-10-01/";
            // ...
           }
        });
    

    and then use that namespace-prefix in your XPath:

    /i:newsItem/i:itemMeta