Search code examples
javaxmlfilesplitting

Splitting a larger size XML file using Java (Retaining Parent's attributes and Siblings)


Consider the XML file, Report.xml :

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
   <Report FileName="abc.bin" reportDate="05/12/2016 02:44:22 AM">
      <Statistics>
        <child value="abc">
         <subchild>...</subchild>
        </child>
        <child value="xyz">
         <subchild>...</subchild>
        </child>
      </Statistics>
      <Properties>
        <child1>...</child1>
        <child2>...</child2>
        .
        .
        .
        <childn>...</childn>
      </Properties>
      <OverallStatistics>
        <child1>...</child1>
        <child2>...</child2>
        .
        .
        .
        <childn>...</childn>
      </OverallStatistics>
  </Report>

I just want to split the above XML file as:

ReportSplit1.xml

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
 <Report FileName="abc.bin" reportDate="05/12/2016 02:44:22 AM">
   <Statistics>
      <child value="abc">
         <subchild>...</subchild>
      </child>
   </Statistics>
   <Properties>
        <child1>...</child1>
        <child2>...</child2>
        .
        .
        .
        <childn>...</childn>
   </Properties>
   <OverallStatistics>
        <child1>...</child1>
        <child2>...</child2>
        .
        .
        .
        <childn>...</childn>
    </OverallStatistics>
</Report>

ReportSplit2.xml

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
 <Report FileName="abc.bin" reportDate="05/12/2016 02:44:22 AM">
   <Statistics>
      <child value="xyz">
         <subchild>...</subchild>
      </child>
   </Statistics>
   <Properties>
        <child1>...</child1>
        <child2>...</child2>
        .
        .
        .
        <childn>...</childn>
   </Properties>
   <OverallStatistics>
        <child1>...</child1>
        <child2>...</child2>
        .
        .
        .
        <childn>...</childn>
    </OverallStatistics>
</Report>

i.e. retaining the parent node's attributes and retaining the sibling nodes. The split should be made only on the children in Statistics node.

I followed the workaround given in the link, changing the snippet as follows:

package xmlsplitting;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.*;
import org.w3c.dom.*;
import org.xml.sax.*;
import javax.xml.transform.*; 
import javax.xml.transform.dom.DOMSource; 
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.*;
/**
 * Writes every {@code /Report/Statistics/child} element of {@code Report.xml}
 * to its own file, {@code ReportSplit1.xml}, {@code ReportSplit2.xml}, ...
 *
 * <p>Only the cloned {@code child} subtree is serialized; the enclosing
 * {@code Report} element, its attributes and the sibling sections are not
 * part of the output.
 */
public class XmlSplit
{
    /**
     * Parses the report, selects the {@code child} elements with XPath and
     * serializes each one that has child nodes to a numbered output file.
     *
     * @param arg command-line arguments (unused)
     * @throws Exception if parsing, XPath evaluation, transformation or file I/O fails
     */
    static public void main(String[] arg) throws Exception
    {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        DocumentBuilder builder = factory.newDocumentBuilder();
        // parse(String) expects a URI; passing a File avoids relying on the
        // parser to repair a raw Windows path.
        Document doc = builder.parse(new File("D:\\Analyzer\\FileSplit\\Report.xml"));
        TransformerFactory tranFactory = TransformerFactory.newInstance();
        Transformer aTransformer = tranFactory.newTransformer();
        XPath xpath = XPathFactory.newInstance().newXPath();
        NodeList list = (NodeList)xpath.evaluate("//Report/Statistics/child", doc, XPathConstants.NODESET);
        // NodeList is 0-based: start at 0 so the first child is not skipped.
        for (int i = 0; i < list.getLength(); i++)
        {
            Node element = list.item(i).cloneNode(true);
            if (element.hasChildNodes())
            {
                Source src = new DOMSource(element);
                // Output files are numbered from 1 (ReportSplit1.xml holds the first child).
                String outputPath = "D:\\Analyzer\\FileSplit\\ReportSplit" + (i + 1) + ".xml";
                // try-with-resources closes the stream even if the transform fails.
                try (FileOutputStream fs = new FileOutputStream(outputPath))
                {
                    Result dest = new StreamResult(fs);
                    aTransformer.transform(src, dest);
                }
            }
        }
    }
}

The XML file splits I actually get are:

ReportSplit1.xml

  <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
   <child value="abc">
      <subchild>...</subchild>
   </child>

ReportSplit2.xml

  <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
   <child value="xyz">
      <subchild>...</subchild>
   </child>

Could anyone suggest a way to achieve the desired XML file splits?


Solution

  • Consider using XSLT, a declarative, special-purpose language for transforming XML documents, instead of XPath alone, since you need a whole-document transformation. For your purposes, an embedded, dynamically built XSLT stylesheet run in a loop over the values can output multiple XML files:

    XSLT Script (embedded below, example here uses 'abc' which is iteratively used and replaced)

    <xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
    <!-- Emit indented UTF-8 XML and drop whitespace-only text nodes from the input. -->
    <xsl:output version="1.0" encoding="UTF-8" indent="yes" />
    <xsl:strip-space elements="*"/>
    
      <!-- Identity transform: copies every attribute and node as-is
           unless a more specific template below matches it. -->
      <xsl:template match="@*|node()">
        <xsl:copy>
          <xsl:apply-templates select="@*|node()"/>
        </xsl:copy>
      </xsl:template>
    
      <!-- Empty template: any element named "child" whose value attribute is
           not 'abc' (including one with no value attribute) produces no output,
           so only the 'abc' child survives in the copy. -->
      <xsl:template match="child[not(@value='abc')]"/>
    
    </xsl:transform>
    

    Java code

    import javax.xml.parsers.*;
    import javax.xml.transform.*;
    import javax.xml.transform.dom.DOMSource;
    import javax.xml.transform.stream.StreamResult;
    import javax.xml.transform.stream.StreamSource;
    import javax.xml.transform.OutputKeys;
    
    import java.io.*;
    import java.net.URISyntaxException;
    
    import org.w3c.dom.Document;
    import org.xml.sax.SAXException;
    /**
     * Produces one filtered copy of a source XML file per value in a fixed list.
     *
     * <p>For each value an XSLT stylesheet is built in memory: an identity
     * template copies every node and attribute, and an empty template drops each
     * {@code child} element whose {@code value} attribute is not the current value.
     */
    public class XmlSplit {
        /**
         * Parses the input once, then runs the per-value stylesheet against it and
         * writes each result to {@code /path/to/output_<value>.xml}.
         *
         * @param args command-line arguments (unused)
         * @throws IOException if the input cannot be read
         * @throws URISyntaxException declared for compatibility; not thrown by this body
         * @throws SAXException if the input is not well-formed XML
         * @throws ParserConfigurationException if no DOM parser can be created
         * @throws TransformerException if the stylesheet cannot be compiled or applied
         */
        public static void main(String[] args) throws IOException, URISyntaxException,
                                                      SAXException, ParserConfigurationException,
                                                      TransformerException {

            // Load XML Source
            String inputXML = "/path/to/XMLSource.xml";

            // Declare XML Values Array
            String[] xmlVals = {"abc", "xyz"};

            // The input does not change between iterations: parse it once and
            // reuse the same DOM tree as the source of every transformation.
            DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
            Document doc = docBuilder.parse(new File(inputXML));
            DOMSource source = new DOMSource(doc);

            // One factory is enough; only the compiled stylesheet differs per value.
            TransformerFactory transformerFactory = TransformerFactory.newInstance();

            // Iterate through Values running dynamic, embedded XSLT
            for (String s: xmlVals) {
                String outputXML = "/path/to/output_" + s + ".xml";

                // NOTE: s is spliced into a single-quoted XPath string literal, so it
                // must not contain a single quote or XML markup characters.
                String xslStr = String.join("\n",
                    "<xsl:transform xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\" version=\"1.0\">",
                    "<xsl:output version=\"1.0\" encoding=\"UTF-8\" indent=\"yes\" />",
                    "<xsl:strip-space elements=\"*\"/>",
                    "<xsl:template match=\"@*|node()\">",
                    "<xsl:copy>",
                    "<xsl:apply-templates select=\"@*|node()\"/>",
                    "</xsl:copy>",
                    "</xsl:template>",
                    "<xsl:template match=\"child[not(@value='"+ s +"')]\"/>",
                    "</xsl:transform>");

                Source xslt = new StreamSource(new StringReader(xslStr));

                // XSLT Transformation with pretty print
                Transformer transformer = transformerFactory.newTransformer(xslt);

                transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
                transformer.setOutputProperty(OutputKeys.STANDALONE, "yes");
                transformer.setOutputProperty(OutputKeys.METHOD, "xml");
                transformer.setOutputProperty(OutputKeys.INDENT, "yes");
                transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
                // Implementation-specific property in the Apache XSLT namespace;
                // presumably only honoured by Xalan-derived transformers.
                transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");

                // Output Result to File
                StreamResult result = new StreamResult(new File(outputXML));
                transformer.transform(source, result);
            }

        }
    }