Search code examples
javaapache-poisaximport-from-excel

How to extract Date as Date Not As Text from Excel using POI Streaming (SAX XML)


I am parsing excel (xlsx) sheets in a web application. I am using Apache POI, the Streaming API because I have large files. Now Excel internally stores dates as number so e.g. I get 42035 instead of 1/31/2015. Because I am using XML parsing I cannot access the dateformatting methods provided by POI. Any ideas?

Here is the parser (from this example https://poi.apache.org/spreadsheet/how-to.html#xssf_sax_api , but changed):

/**
 * See org.xml.sax.helpers.DefaultHandler javadocs
 */
private static class SheetHandler extends DefaultHandler {

    private SharedStringsTable table;
    private String lastContents;
    private boolean nextIsString;
    boolean isFirstRow = true;
    private int quantityOfColumns;
    private int currentColumnNumber = 1;
    int currentRowNumber = 1;
    private int rowNumberOfLastCell = 1;
    private DataSet data = new DataSet();
    private Tuple tuple;

    static final Logger LOG = LoggerFactory.getLogger(SheetHandler.class);

    private SheetHandler(SharedStringsTable sst) {
        this.table = sst;
        LOG.debug("Sheethandler created");
    }

    @Override
    public void startElement(String uri, String localName, String name,
            Attributes attributes) throws SAXException {
        // c => cell
        LOG.debug("Reading element of type {} ", name);
        if (name.equals("c")) {
            rowNumberOfLastCell = currentRowNumber;
            String cellType = attributes.getValue("t");
            LOG.debug("Extracting cell {} with type {}", attributes.getValue("r"), cellType);
            currentRowNumber = extractIntFromString(attributes.getValue("r"));
            if (isFinishedRowHeaderRow()) {
                extractHeaders();
            }
            // Figure out if the value is an index in the SST (Static String Table)
            if (cellType != null && cellType.equals("s")) {
                System.out.println("We have content!");
                nextIsString = true;
            } else {
                nextIsString = false;
            }
        }
        // Clear contents cache
        lastContents = "";
    }

    private boolean isFinishedRowHeaderRow() {
        return (rowNumberOfLastCell != currentRowNumber) && isFirstRow;
    }

    private void extractHeaders() {
        quantityOfColumns = data.getHeaders().size();
        LOG.debug("{} rows detected", quantityOfColumns); 
        LOG.debug("{} headers parsed", data.getHeaders().size());
        isFirstRow = false;
        currentColumnNumber = 1;
        tuple = new Tuple(quantityOfColumns);
    }

    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        lastContents += new String(ch, start, length);
        LOG.debug("Extracted value \"{}\"", lastContents);
    }

    @Override
    public void endElement(String uri, String localName, String name)
            throws SAXException {
        // Process the last contents as required.
        // Do now, as characters() may be called more than once
        if (nextIsString) {
            extractStringFromCell();
        }
        // v => contents of a cell
        // Output after we've seen the string contents
        if (name.equals("v")) {
            addExtractedValueToDataSet();
            LOG.debug("Ended parsing of column {}", currentColumnNumber);
            if (currentColumnNumber == (quantityOfColumns)) {
                concludeRow();
            } else {
                currentColumnNumber++;
            }
        }
    }

    private void concludeRow() {
        data.addRow(tuple);
        LOG.debug("Row added to DataSet");
        tuple = new Tuple(quantityOfColumns);
        currentColumnNumber = 1;
        LOG.debug("Row {} parsed", rowNumberOfLastCell);
    }

    private void addExtractedValueToDataSet() {
        if (isFirstRow) {
            data.getHeaders().add(lastContents);
        } else {
            tuple.getRowEntries()[currentColumnNumber - 1] = lastContents;
        }
    }

    private void extractStringFromCell() throws NumberFormatException {
        int idx = Integer.parseInt(lastContents);
        lastContents = new XSSFRichTextString(table.getEntryAt(idx)).toString();
        LOG.debug("Extracted value \"{}\"", lastContents);
        nextIsString = false;
    }

    public int extractIntFromString(String original) {
        char[] chars = original.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            try {
                return Integer.valueOf(original.substring(i, original.length()));
            } catch (NumberFormatException ex) {
            }
        }
        return 0;
    }

    public DataSet getData() {
        return data;
    }
}

Solution

  • @Gagravarr Thanks for the reply, for some reason I overlooked this, which helped me solve my problem:

    https://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/xssf/eventusermodel/XLSX2CSV.java