An Apache POI SAX reader, implemented similarly to this well-known example https://github.com/pjfanning/poi-shared-strings-sample/blob/master/src/main/java/com/github/pjfanning/poi/sample/XLSX2CSV.java, reads some date values differently from how they are presented in Excel, even though it is supposed to read the "formatted value".
Value in the Excel file: 1/1/2019; "formatted value" read by the reader: 1/1/19.
Any idea why there is a difference?
Apache POI version 3.17
Reader code:
package com.lopuch.sk.lita.is.importer;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.log4j.Logger;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.util.ZipSecureFile;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.util.CellAddress;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.util.SAXHelper;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import com.lopuch.sk.lita.is.importer.fileImport.ExcelRowReadListener;
/**
 * Reads an XLSX workbook with the Apache POI SAX (event) API and reports every
 * non-empty row to the registered {@link ExcelRowReadListener}.
 *
 * <p>Each row is delivered as a map from column letters (for example {@code "A"},
 * {@code "AB"}) to the formatted cell value produced by {@link DataFormatter}.
 * Columns whose cell is absent in the sheet XML are not present in the map.
 */
public class ExcelSaxImporter {

    private static final Logger logger = Logger.getLogger(ExcelSaxImporter.class);

    /** Receiver of row callbacks; must be set before {@link #process(byte[])} is called. */
    private ExcelRowReadListener listener;

    /**
     * Registers the listener that is notified for every non-empty row.
     *
     * @param listener callback invoked from the sheet handler's {@code endRow}
     */
    public void setOnRowRead(ExcelRowReadListener listener) {
        this.listener = listener;
    }

    /**
     * @return the currently registered row listener, or {@code null} if none was set
     */
    public ExcelRowReadListener getListener() {
        return listener;
    }

    /**
     * Parses every sheet of the given workbook and forwards its rows to the listener.
     *
     * @param fileByteArray raw bytes of an XLSX file
     * @throws IOException if the package or a sheet stream cannot be read
     * @throws OpenXML4JException if the bytes are not a valid OOXML package
     * @throws ParserConfigurationException declared for interface compatibility
     * @throws SAXException if a sheet's XML cannot be parsed
     */
    public void process(byte[] fileByteArray)
            throws IOException, OpenXML4JException, ParserConfigurationException, SAXException {
        // NOTE(review): a ratio of 0.0 switches off POI's zip-bomb detection, and the
        // setting is static (JVM-wide). Kept for compatibility; confirm the input is trusted.
        ZipSecureFile.setMinInflateRatio(0.0d);

        OPCPackage opcPackage = OPCPackage.open(new ByteArrayInputStream(fileByteArray));
        try {
            ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(opcPackage);
            XSSFReader xssfReader = new XSSFReader(opcPackage);
            StylesTable styles = xssfReader.getStylesTable();
            XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
            while (iter.hasNext()) {
                InputStream stream = iter.next();
                try {
                    processSheet(styles, strings, getHandler(), stream);
                } finally {
                    // Close the sheet stream even when parsing fails.
                    stream.close();
                }
            }
        } finally {
            // The package is only read, so discard it without saving to release its resources.
            opcPackage.revert();
        }
    }

    /**
     * Creates a fresh handler that collects the cells of one row and hands the row
     * to the listener when the row ends.
     */
    private SheetContentsHandler getHandler() {
        return new SheetContentsHandler() {
            private int currentRow = -1;
            private int currentCol = -1;
            // Maps column letter name to its value. Does not contain a key-value
            // pair if the cell is absent for the currently processed row.
            private Map<String, String> rowValues;

            @Override
            public void startRow(int rowNum) {
                // Prepare for this row
                currentRow = rowNum;
                currentCol = -1;
                rowValues = new HashMap<String, String>();
            }

            @Override
            public void endRow(int rowNum) {
                if (rowValues.isEmpty()) {
                    logger.trace("Skipping calling rowRead() because of empty row");
                } else {
                    ExcelSaxImporter.this.getListener().rowRead(rowValues);
                }
            }

            @Override
            public void cell(String cellReference, String formattedValue, XSSFComment comment) {
                // Gracefully handle a missing cell reference: treat the cell as the one
                // immediately after the previously seen column (column 0 at row start).
                if (cellReference == null) {
                    cellReference = new CellAddress(currentRow, currentCol + 1).formatAsString();
                }
                currentCol = new CellReference(cellReference).getCol();
                // Key the value by column letters only (row digits dropped).
                rowValues.put(CellReference.convertNumToColString(currentCol), formattedValue);
            }

            @Override
            public void headerFooter(String text, boolean isHeader, String tagName) {
                // Headers and footers are not imported.
            }
        };
    }

    /**
     * Parses one sheet and feeds its content to the given handler, using the
     * specified styles and shared-strings tables.
     *
     * <p>Values are formatted by a default {@link DataFormatter}.
     *
     * @param styles styles table of the workbook
     * @param strings shared-strings table of the workbook
     * @param sheetHandler receiver of row, cell and header/footer events
     * @param sheetInputStream XML stream of the sheet; not closed by this method
     * @throws IOException if the sheet stream cannot be read
     * @throws ParserConfigurationException declared for interface compatibility; a
     *         parser configuration failure is rethrown as {@link RuntimeException}
     * @throws SAXException if the sheet XML is malformed
     */
    public void processSheet(StylesTable styles, ReadOnlySharedStringsTable strings, SheetContentsHandler sheetHandler,
            InputStream sheetInputStream) throws IOException, ParserConfigurationException, SAXException {
        DataFormatter formatter = new DataFormatter();
        InputSource sheetSource = new InputSource(sheetInputStream);
        try {
            XMLReader sheetParser = SAXHelper.newXMLReader();
            ContentHandler handler = new XSSFSheetXMLHandler(styles, null, strings, sheetHandler, formatter, false);
            sheetParser.setContentHandler(handler);
            sheetParser.parse(sheetSource);
        } catch (ParserConfigurationException e) {
            // Keep the original exception as the cause so the stack trace is not lost.
            throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage(), e);
        }
    }
}
The difference between the value displayed by Excel and the value read by Apache POI comes from date formats that react to the user's language settings. From Excel:
Date formats that begin with an asterisk (*) respond to changes in regional date and time settings that are specified for the operating system.
The Apache POI DataFormatter ignores these locale-specific formats and returns the date in the default US format. From the Apache POI DataFormatter documentation:
Some formats are automatically "localized" by Excel, eg show as mm/dd/yyyy when loaded in Excel in some Locales but as dd/mm/yyyy in others. These are always returned in the "default" (US) format, as stored in the file.
To work around this behavior, see the answer to "Java: excel to csv date conversion issue with Apache Poi".