Search code examples
java ms-word apache-poi

Trying to get Office Word fields using Apache POI, but it does NOT work


I am learning to use Apache POI to get fields from a Word document. My demo code is shown below; most of it was generated by ChatGPT and I made a small modification, but it does NOT work. A snapshot of the demo Word document is also shown below.

import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSimpleField;

import java.io.FileInputStream;
import java.util.List;

/**
 * Demo from the question: opens a .docx file, prints the text of every
 * paragraph and tries to print the instruction of each simple field found
 * among the paragraph's child elements.
 */
public class FindSimpleFieldInDocx {
    /**
     * Reads the hard-coded template file and walks all of its paragraphs.
     * Any exception is caught and its stack trace printed.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        try {

            FileInputStream fis = new FileInputStream("/Users/pengbo/word/template1.docx");
            XWPFDocument document = new XWPFDocument(fis);
            List<XWPFParagraph> paragraphs = document.getParagraphs();
            for (XWPFParagraph para : paragraphs) {
                System.out.println(para.getText());
                // Cursor over the paragraph's underlying XML bean.
                XmlCursor cursor = para.getCTP().newCursor();
                // Select the direct child elements of the paragraph.
                cursor.selectPath("./*");
                while (cursor.toNextSelection()) {
                    XmlCursor.TokenType tokenType = cursor.currentTokenType();
                    if (tokenType.equals(XmlCursor.TokenType.START)) {
                        // Compares the local part of the CTSimpleField schema type name with
                        // the local part of the current element's name.
                        // NOTE(review): the schema type is reportedly named CT_SimpleField while
                        // the element is named fldSimple, so this test presumably never
                        // succeeds - consistent with the "cannot reach here" remark below.
                        if (CTSimpleField.type.getName().getLocalPart().equals(cursor.getName().getLocalPart())) {
                            //cannot reach here........
                            CTSimpleField simpleField = (CTSimpleField) cursor.getObject();
                            String fieldCode = simpleField.getInstr();
                            
                            System.out.println("find out CTSimpleField: " + fieldCode);
                        }
                    }
                }
                // Release the cursor once the paragraph has been processed.
                cursor.dispose();
            }
            // NOTE(review): these close calls are skipped if an exception is thrown earlier.
            document.close();
            fis.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

template1.docx snapshot template1.docx document file

        <!-- Apache POI core -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>5.2.2</version>
        </dependency>

        <!-- Apache POI OOXML -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>5.2.2</version>
        </dependency>

        <!-- Apache POI OOXML Schemas -->
        <!-- NOTE(review): 4.1.2 does not match the 5.2.2 POI artifacts above; presumably the versions should be aligned - confirm -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml-schemas</artifactId>
            <version>4.1.2</version> <!-- or use the latest version that matches your POI version -->
        </dependency>

        <!-- XMLBeans -->
        <dependency>
            <groupId>org.apache.xmlbeans</groupId>
            <artifactId>xmlbeans</artifactId>
            <version>5.1.0</version>
        </dependency>

        <!-- Log4j to SLF4J bridge -->
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-to-slf4j</artifactId>
            <version>2.17.2</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.logging.log4j/log4j-api -->
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-api</artifactId>
            <version>2.17.2</version>
        </dependency>

Can someone provide demo code showing how to get Office Word fields from a Word document? Thanks.


Solution

  • Comparing XML element names with XmlObject schema type names is not the way to go. For a simple field the XML element local part name is fldSimple while the schema type name is CT_SimpleField. That doesn't match.

    If you really want to find the CTSimpleFields, search for objects which are instances of CTSimpleField: cursor.getObject() instanceof CTSimpleField:

    ...
                    if (tokenType.equals(XmlCursor.TokenType.START)) {
                        if (cursor.getObject() instanceof CTSimpleField) {
                            //current object is instance of CTSimpleField
                            CTSimpleField simpleField = (CTSimpleField) cursor.getObject();
                            String fieldCode = simpleField.getInstr();
                            
                            System.out.println("find out CTSimpleField: " + fieldCode);
                        }
                    }
    ...
    

    This should find the fields, when stored as simple field.

    However, current versions of Microsoft Word do not store all fields as simple fields. One can check this by unzipping the *.docx ZIP archive and looking into /word/document.xml. For me, the XML for a date field looks like this, for example:

    <w:p >
     <w:r>
      <w:fldChar w:fldCharType="begin"/>
     </w:r>
     <w:r>
      <w:instrText xml:space="preserve"> DATE \* MERGEFORMAT </w:instrText>
     </w:r>
     <w:r>
      <w:fldChar w:fldCharType="separate"/>
     </w:r>
     <w:r>
      <w:t>05.06.2024</w:t>
     </w:r>
     <w:r>
      <w:fldChar w:fldCharType="end"/>
     </w:r>
    </w:p>
    

    To get this using Apache POI, another approach is needed.

    The following complete example shows how to get CTSimpleFields as well as CTFldChar fields.

    import org.apache.poi.xwpf.usermodel.XWPFDocument;
    import org.apache.poi.xwpf.usermodel.XWPFParagraph;
    import org.apache.poi.xwpf.usermodel.XWPFRun;
    import org.apache.xmlbeans.XmlCursor;
    import org.apache.xmlbeans.XmlObject;
    import org.apache.xmlbeans.SimpleValue;
    import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSimpleField;
    import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFldChar;
    import org.openxmlformats.schemas.wordprocessingml.x2006.main.STFldCharType;
    
    import java.io.FileInputStream;
    
    /**
     * Prints the fields found in the paragraphs of a Word document.
     *
     * <p>Two storage forms are handled: simple fields ({@code w:fldSimple}
     * elements) and fields spread over several runs that are delimited by
     * {@code w:fldChar} elements of type begin and end.
     */
    public class FindFieldsInDocx {

        /** XPath prolog that binds the {@code w} prefix to the WordprocessingML main namespace. */
        private static final String W_NAMESPACE_DECLARATION =
            "declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' ";

        /**
         * Prints every simple field contained anywhere below the paragraph as
         * {@code <instruction> | <string value>}.
         *
         * @param paragraph the paragraph to search
         */
        static void findAndPrintCTSimpleFields(XWPFParagraph paragraph) {
            XmlCursor cursor = paragraph.getCTP().newCursor();
            try {
                cursor.selectPath(W_NAMESPACE_DECLARATION + ".//w:fldSimple");
                while (cursor.toNextSelection()) {
                    XmlObject selected = cursor.getObject();
                    if (selected instanceof CTSimpleField) {
                        CTSimpleField simpleField = (CTSimpleField) selected;
                        System.out.println(simpleField.getInstr() + " | " + ((SimpleValue) simpleField).getStringValue());
                    }
                }
            } finally {
                // Always release the cursor, even if the selection or printing throws.
                cursor.dispose();
            }
        }

        /**
         * Prints every field of the paragraph that is delimited by fldChar
         * begin/end markers. The string value of each run from the begin run
         * through the end run is collected, each followed by {@code " | "},
         * and the collected line is printed when the end marker is reached.
         *
         * @param paragraph the paragraph whose runs are scanned
         */
        static void findAndPrintCTFldCharFields(XWPFParagraph paragraph) {
            StringBuilder runFieldContent = new StringBuilder();
            boolean insideField = false;
            for (XWPFRun run : paragraph.getRuns()) {
                if (containsFldChar(run, STFldCharType.BEGIN)) {
                    insideField = true;
                }
                if (!insideField) {
                    continue;
                }
                // Each run inside the field contributes its content exactly once.
                runFieldContent.append(((SimpleValue) run.getCTR()).getStringValue());
                runFieldContent.append(" | ");
                if (containsFldChar(run, STFldCharType.END)) {
                    insideField = false;
                    System.out.println(runFieldContent);
                    // Start with an empty buffer for the next field of this paragraph.
                    runFieldContent.setLength(0);
                }
            }
        }

        /**
         * Tells whether the run contains a {@code w:fldChar} element of the given type.
         *
         * @param run  the run to inspect
         * @param type the field character type to look for
         * @return {@code true} if such an element exists anywhere below the run
         */
        static boolean containsFldChar(XWPFRun run, STFldCharType.Enum type) {
            XmlCursor cursor = run.getCTR().newCursor();
            try {
                cursor.selectPath(W_NAMESPACE_DECLARATION + ".//w:fldChar");
                while (cursor.toNextSelection()) {
                    XmlObject selected = cursor.getObject();
                    if (selected instanceof CTFldChar
                            && ((CTFldChar) selected).getFldCharType() == type) {
                        return true;
                    }
                }
                return false;
            } finally {
                // Runs on the early return above as well, so the cursor is never leaked.
                cursor.dispose();
            }
        }

        /**
         * Opens {@code ./template1.docx} and prints the simple fields and the
         * fldChar-delimited fields of every paragraph. Any exception is caught
         * and its stack trace printed.
         *
         * @param args not used
         */
        public static void main(String[] args) {
            try (FileInputStream fis = new FileInputStream("./template1.docx");
                 XWPFDocument document = new XWPFDocument(fis)) {
                for (XWPFParagraph paragraph : document.getParagraphs()) {
                    findAndPrintCTSimpleFields(paragraph);
                    findAndPrintCTFldCharFields(paragraph);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }