I am learning how to use Apache POI to get fields from a Word document. My demo code is shown below; most of it was generated by ChatGPT and I made a small modification, but it does NOT work. A snapshot of the demo Word document is also shown below.
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSimpleField;
import java.io.FileInputStream;
import java.util.List;
/**
 * Demo that opens a .docx file, prints the text of every paragraph and walks
 * each paragraph's direct child XML elements looking for simple fields.
 */
public class FindSimpleFieldInDocx {
/**
 * Reads the hard-coded document path and prints each paragraph's text plus the
 * instruction of any element the name comparison below identifies as a simple field.
 * Any exception is caught and its stack trace printed.
 */
public static void main(String[] args) {
try {
FileInputStream fis = new FileInputStream("/Users/pengbo/word/template1.docx");
XWPFDocument document = new XWPFDocument(fis);
List<XWPFParagraph> paragraphs = document.getParagraphs();
for (XWPFParagraph para : paragraphs) {
System.out.println(para.getText());
// Select the paragraph's direct child elements ("./*") and visit them one by one.
XmlCursor cursor = para.getCTP().newCursor();
cursor.selectPath("./*");
while (cursor.toNextSelection()) {
XmlCursor.TokenType tokenType = cursor.currentTokenType();
if (tokenType.equals(XmlCursor.TokenType.START)) {
// Compares the local part of CTSimpleField's schema type name with the
// local part of the current element's name.
if (CTSimpleField.type.getName().getLocalPart().equals(cursor.getName().getLocalPart())) {
//cannot reach here........
CTSimpleField simpleField = (CTSimpleField) cursor.getObject();
String fieldCode = simpleField.getInstr();
System.out.println("find out CTSimpleField: " + fieldCode);
}
}
}
cursor.dispose();
}
// These close calls sit inside the try body, so they only run when nothing above threw.
document.close();
fis.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
template1.docx snapshot template1.docx document file
<!-- Apache POI core -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>5.2.2</version>
</dependency>
<!-- Apache POI OOXML -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.2</version>
</dependency>
<!-- Apache POI OOXML Schemas -->
<!-- NOTE(review): this artifact is at 4.1.2 while poi and poi-ooxml above are at 5.2.2; confirm that mixing these versions is intended -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>4.1.2</version> <!-- or use the latest version that matches your POI version -->
</dependency>
<!-- XMLBeans -->
<dependency>
<groupId>org.apache.xmlbeans</groupId>
<artifactId>xmlbeans</artifactId>
<version>5.1.0</version>
</dependency>
<!-- Log4j to SLF4J bridge -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-to-slf4j</artifactId>
<version>2.17.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.logging.log4j/log4j-api -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>2.17.2</version>
</dependency>
Can someone provide demo code showing how to get the fields of a Word document? Thanks.
Comparing XML element names with XmlObject schema type names is not the way to go. For a simple field, the XML element's local name is fldSimple, while the schema type name is CT_SimpleField. Those do not match.
If you really want to find the CTSimpleField elements, search for objects which are instances of CTSimpleField, i.e. test cursor.getObject() instanceof CTSimpleField:
...
// Only look at the object when the cursor is positioned on an element start token.
if (tokenType.equals(XmlCursor.TokenType.START)) {
// Match on the Java type of the object at the cursor instead of comparing names.
if (cursor.getObject() instanceof CTSimpleField) {
//current object is instance of CTSimpleField
CTSimpleField simpleField = (CTSimpleField) cursor.getObject();
String fieldCode = simpleField.getInstr();
System.out.println("find out CTSimpleField: " + fieldCode);
}
}
...
This should find the fields when they are stored as simple fields.
However, current Microsoft Word does not store all fields as simple fields. One can check this by unzipping the *.docx ZIP archive and looking into /word/document.xml. For me, the XML for a date field looks like this, for example:
<w:p >
<w:r>
<w:fldChar w:fldCharType="begin"/>
</w:r>
<w:r>
<w:instrText xml:space="preserve"> DATE \* MERGEFORMAT </w:instrText>
</w:r>
<w:r>
<w:fldChar w:fldCharType="separate"/>
</w:r>
<w:r>
<w:t>05.06.2024</w:t>
</w:r>
<w:r>
<w:fldChar w:fldCharType="end"/>
</w:r>
</w:p>
To get this using Apache POI, another approach is needed.
The following complete example shows how to get CTSimpleField fields as well as CTFldChar fields.
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlObject;
import org.apache.xmlbeans.SimpleValue;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSimpleField;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFldChar;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.STFldCharType;
import java.io.FileInputStream;
/**
 * Prints the fields found in the body paragraphs of a Word (.docx) document.
 *
 * <p>Two storage forms are handled: simple fields ({@code w:fldSimple}) and
 * fields delimited by {@code w:fldChar} begin/end markers spread over runs.
 */
public class FindFieldsInDocx {

    /** XPath prolog binding the {@code w} prefix to the WordprocessingML main namespace. */
    private static final String DECLARE_W_NAMESPACE =
            "declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' ";

    /** Document opened by {@link #main(String[])} when no path argument is given. */
    private static final String DEFAULT_DOCUMENT_PATH = "./template1.docx";

    /**
     * Prints every simple field below the paragraph as {@code instruction | text}.
     *
     * @param paragraph the paragraph whose XML is searched for {@code w:fldSimple} elements
     */
    static void findAndPrintCTSimpleFields(XWPFParagraph paragraph) {
        XmlCursor cursor = paragraph.getCTP().newCursor();
        try {
            cursor.selectPath(DECLARE_W_NAMESPACE + ".//w:fldSimple");
            while (cursor.toNextSelection()) {
                // Fetch the object once; it is both type-checked and cast below.
                Object current = cursor.getObject();
                if (current instanceof CTSimpleField) {
                    CTSimpleField simpleField = (CTSimpleField) current;
                    System.out.println(
                            simpleField.getInstr() + " | " + ((SimpleValue) simpleField).getStringValue());
                }
            }
        } finally {
            // Release the cursor even if the selection or iteration throws.
            cursor.dispose();
        }
    }

    /**
     * Prints every begin/end delimited field of the paragraph on its own line.
     *
     * <p>The string value of each run from the one containing the {@code begin}
     * marker through the one containing the {@code end} marker is emitted once,
     * each followed by {@code " | "}. A field that has no {@code end} marker
     * within this paragraph's runs is not printed.
     *
     * @param paragraph the paragraph whose runs are scanned
     */
    static void findAndPrintCTFldCharFields(XWPFParagraph paragraph) {
        StringBuilder runFieldContent = new StringBuilder();
        boolean insideField = false;
        for (XWPFRun run : paragraph.getRuns()) {
            if (containsFldChar(run, STFldCharType.BEGIN)) {
                insideField = true;
            }
            if (!insideField) {
                continue;
            }
            runFieldContent.append(((SimpleValue) run.getCTR()).getStringValue());
            runFieldContent.append(" | ");
            if (containsFldChar(run, STFldCharType.END)) {
                // The closing run's text was already appended above; appending it
                // a second time here would duplicate it in the output.
                insideField = false;
                System.out.println(runFieldContent);
                runFieldContent.setLength(0);
            }
        }
    }

    /**
     * Tells whether the run contains a {@code w:fldChar} element of the given type.
     *
     * @param run  the run to inspect
     * @param type the field character type to look for
     * @return {@code true} if at least one matching {@code w:fldChar} is found
     */
    static boolean containsFldChar(XWPFRun run, STFldCharType.Enum type) {
        XmlCursor cursor = run.getCTR().newCursor();
        try {
            cursor.selectPath(DECLARE_W_NAMESPACE + ".//w:fldChar");
            while (cursor.toNextSelection()) {
                Object current = cursor.getObject();
                if (current instanceof CTFldChar && ((CTFldChar) current).getFldCharType() == type) {
                    return true;
                }
            }
            return false;
        } finally {
            // Runs on the early return as well, so the cursor is always released.
            cursor.dispose();
        }
    }

    /**
     * Opens the document and prints the fields of each body paragraph.
     *
     * @param args optional: {@code args[0]} is the path of the .docx file;
     *             when absent, {@code ./template1.docx} is used
     */
    public static void main(String[] args) {
        String path = args.length > 0 ? args[0] : DEFAULT_DOCUMENT_PATH;
        try (FileInputStream fis = new FileInputStream(path);
                XWPFDocument document = new XWPFDocument(fis)) {
            for (XWPFParagraph paragraph : document.getParagraphs()) {
                findAndPrintCTSimpleFields(paragraph);
                findAndPrintCTFldCharFields(paragraph);
            }
        } catch (Exception e) {
            // Top-level boundary of a demo program: report and exit normally.
            e.printStackTrace();
        }
    }
}