Search code examples
javaexcelapache-poi

Using Apache POI to extract excel text type attachment encode issue


I am now using Apache POI to extract the attachments from the Excel file, here is part of my code.

    Sheet sheetAt = workbook.getSheet(sheetName);
    Drawing<?> drawingPatriarch = sheetAt.getDrawingPatriarch();
    if (drawingPatriarch != null) {
        Iterator<?> iterator = drawingPatriarch.iterator();
        if (iterator.hasNext()) {
            Object next = iterator.next();
            if (next instanceof ObjectData) {
                ObjectData o = (ObjectData) next;
                IOUtil.write(o.getObjectData(), outputPath);
            } else if (next instanceof Picture) {
                Picture o = (Picture) next;
                IOUtil.write(o.getData(), outputPath);
            } 
        }
    }

When the attachment is a binary type, such as exe, png, etc., the file extracted in this way is normal, but if the attachment is a text type, such as txt, pdf, etc., the extracted file can not be opened normally, view the binary content, in addition to the original file has a lot of extra data, how do I parse this data.


Solution

  • I suspect your ObjectData is of type oleObject or Objekt-Manager-Shellobjekt. Those objects are stored in embedded oleObject*.bin files. Those files have file systems of their own which need to be read. To do so, first get the DirectoryEntry and DirectoryNode and then get the Ole10Native. Having that you can get the file data as byte[] data = ole10Native.getDataBuffer().

    Complete example:

    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    
    import org.apache.poi.ss.usermodel.*;
    import org.apache.poi.poifs.filesystem.*;
    
    public class ExcelGetObjectData {
        
     public static void main(String[] args) throws Exception {
    
      //String inFilePath = "./ExcelExampleIn.xlsx"; String outFilePath = "./ExcelExampleOut.xlsx";
      String inFilePath = "./ExcelExampleIn.xls"; String outFilePath = "./ExcelExampleOut.xls";
      
      try (Workbook workbook = WorkbookFactory.create(new FileInputStream(inFilePath));
           FileOutputStream out = new FileOutputStream(outFilePath ) ) {
    
       Sheet sheet = workbook.getSheetAt(0);
       Drawing<?> drawingPatriarch = sheet.getDrawingPatriarch();
       if (drawingPatriarch != null) {
        for (Shape shape : drawingPatriarch) {
         System.out.println(shape);
         if (shape instanceof ObjectData) {
          ObjectData objectData = (ObjectData) shape;
          System.out.println(objectData.getFileName());
          System.out.println(objectData.getOLE2ClassName());
          System.out.println(objectData.getContentType());
          if(objectData.getOLE2ClassName().equals("Objekt-Manager-Shellobjekt")) {
           if (objectData.hasDirectoryEntry()) {
            DirectoryEntry directory = objectData.getDirectory();
            if (directory instanceof DirectoryNode) {
             DirectoryNode directoryNode = (DirectoryNode)directory;
             Ole10Native ole10Native = Ole10Native.createFromEmbeddedOleObject(directoryNode);
             System.out.println(ole10Native.getCommand());
             System.out.println(ole10Native.getFileName());
             System.out.println(ole10Native.getLabel());
             byte[] data = ole10Native.getDataBuffer(); //data now contains the embedded data
             try (FileOutputStream dataOut = new FileOutputStream("./" + ole10Native.getLabel())) {
              dataOut.write(data);
             }
            }
           }
          } else if(objectData.getOLE2ClassName().equals("...")) {
           //...
          }          
         } else if (shape instanceof /*other*/Object) {
          //...
         }
        }        
       }
       
       workbook.write(out);
      }
     }
    }
    

    Using EmbeddedExtractor extraction of all embedded objects could be done like so:

    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    
    import org.apache.poi.ss.usermodel.*;
    import org.apache.poi.ss.extractor.EmbeddedExtractor;
    import org.apache.poi.ss.extractor.EmbeddedData;
    
    public class ExcelEmbeddedExtractor {
        
     public static void main(String[] args) throws Exception {
    
      String inFilePath = "./ExcelExampleIn.xlsx"; String outFilePath = "./ExcelExampleOut.xlsx";
      //String inFilePath = "./ExcelExampleIn.xls"; String outFilePath = "./ExcelExampleOut.xls";
      
      try (Workbook workbook = WorkbookFactory.create(new FileInputStream(inFilePath));
           FileOutputStream out = new FileOutputStream(outFilePath ) ) {
    
       Sheet sheet = workbook.getSheetAt(0);
       EmbeddedExtractor extractor = new EmbeddedExtractor();
       for (EmbeddedData embeddedData : extractor.extractAll(sheet)) {
        Shape shape = embeddedData.getShape();
        System.out.println(shape);    
        String filename = embeddedData.getFilename();
        System.out.println(filename);   
        byte[] data = embeddedData.getEmbeddedData(); //data now contains the embedded data
        try (FileOutputStream dataOut = new FileOutputStream("./" + filename)) {
         dataOut.write(data);
        }    
       }
       
       workbook.write(out);
      }
     }
    }