Search code examples
javadocx4j

Docx4j functionality to turn a document into JSON representation?


Is there a good way to convert a document into a JSON representation so that it can then be displayed on a web page? (It is a requirement that the document is converted to JSON.)

If there isn't a built-in way to do this, my idea is to represent the Run/Paragraph structure as JSON objects, but I feel like this wouldn't work as well once I start working with more complex Word documents.


Solution

  • If you add:

    <!-- Jackson's XML data-format module; it supplies the
         com.fasterxml.jackson.dataformat.xml.XmlMapper class used by the example code.
         NOTE(review): releases from 2.12 onward reportedly make readTree collect repeated
         sibling elements into JSON arrays; confirm against the jackson-dataformat-xml
         release notes before relying on the version pinned here. -->
    <dependency>
        <groupId>com.fasterxml.jackson.dataformat</groupId>
        <artifactId>jackson-dataformat-xml</artifactId>
        <version>2.11.3</version>
    </dependency>
    

    you can try something like:

    import org.docx4j.Docx4J;
    import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
    
    import com.fasterxml.jackson.databind.JsonNode;
    import com.fasterxml.jackson.databind.ObjectMapper;
    import com.fasterxml.jackson.dataformat.xml.XmlMapper;
    
    /**
     * Command-line sample that dumps the main document part of a docx file as JSON.
     *
     * <p>The XML of the main document part is parsed into a Jackson tree with
     * {@code XmlMapper} and that tree is then written out, pretty-printed, with a plain
     * {@code ObjectMapper}.
     *
     * <p>NOTE(review): repeated sibling elements (for example several {@code w:p}) may not
     * all survive this conversion; compare the printed JSON with the source XML.
     */
    public class ConvertOutJSON {

        /** Location of the docx file to convert, resolved against the working directory. */
        static String inputfilepath = System.getProperty("user.dir") + "/sample-docs/sample-docxv2.docx";

        /**
         * Loads the docx at {@link #inputfilepath}, converts its main document part to
         * JSON and prints the result to standard output.
         *
         * @param args ignored
         * @throws Exception if loading the package, reading its XML, or either Jackson
         *                   step fails
         */
        public static void main(String[] args) throws Exception {
            // Open the package and pull out the raw XML of the main document part.
            java.io.File docxFile = new java.io.File(inputfilepath);
            WordprocessingMLPackage pkg = Docx4J.load(docxFile);
            String documentXml = pkg.getMainDocumentPart().getXML();

            // XML text -> generic Jackson tree.
            JsonNode tree = new XmlMapper().readTree(documentXml);

            // Tree -> indented JSON text on stdout.
            String prettyJson = new ObjectMapper()
                    .writerWithDefaultPrettyPrinter()
                    .writeValueAsString(tree);
            System.out.println(prettyJson);
        }
    }
    

    However in a quick test, I noticed some w:p nodes were not being emitted as JSON. I haven't looked to see whether they get dropped by Jackson at the readTree step or when ObjectMapper writes its output; you'll need to dig into Jackson to fix that.

    It is currently producing output like:

    {
      "Ignorable" : "w14 wp14",
      "body" : {
        "p" : {
          "rsidR" : "00D15781",
          "rsidRDefault" : "00D15781",
          "pPr" : {
            "ind" : {
              "left" : "0"
            }
          }
        },
        "tbl" : {
          "tblPr" : {
            "tblStyle" : {
              "val" : "TableGrid"
            },
            "tblW" : {
              "w" : "0",
              "type" : "auto"
            },
            "tblLook" : {
              "firstRow" : "1",
              "lastRow" : "0",
              "firstColumn" : "1",
              "lastColumn" : "0",
              "noHBand" : "0",
              "noVBand" : "1",
              "val" : "04A0"
            }
          },
          "tblGrid" : {
            "gridCol" : {
              "w" : "3561"
            }
          },
          "tr" : {
            "rsidR" : "00D15781",
            "tc" : {
              "tcPr" : {
                "tcW" : {
                  "w" : "7122",
                  "type" : "dxa"
                },
                "gridSpan" : {
                  "val" : "2"
                }
              },
              "p" : {
                "rsidR" : "00D15781",
                "rsidRDefault" : "00945132",
                "pPr" : {
                  "ind" : {
                    "left" : "0"
                  }
                },
                "r" : {
                  "t" : "Horizontal merge"
                }
              }
            }
          }
        },
        "sectPr" : {
          "rsidR" : "00D15781",
          "headerReference" : {
            "type" : "default",
            "id" : "rId12"
          },
          "pgSz" : {
            "w" : "11907",
            "h" : "16839",
            "code" : "9"
          },
          "pgMar" : {
            "top" : "720",
            "right" : "720",
            "bottom" : "720",
            "left" : "720",
            "header" : "720",
            "footer" : "720",
            "gutter" : "0"
          },
          "cols" : {
            "space" : "720"
          },
          "docGrid" : {
            "linePitch" : "360"
          }
        }
      }
    }