Search code examples
linuxcompressionzipxlsx

compression method for xlsx with 7z


I'm trying to programmatically modify an excel file (xlsx). I can successfully decompress, modify the xml as needed, and re-compress. However, i'm getting a warning everytime i open excel, even though it does read the file. I believe the error is due to the compression method used. This is an example of the closest I can get:

Decompress

7z x original.xlsx -o./decomp_xlsx

..Do some stuff..

Compress

7z a -tzip new ./decomp_xlsx/*

Rename

mv ./new.zip ./new.xlsx

The Error i get is: Excel found unreadable content in 'new.xlsx'. Do you want to recover the contents of this workbook? If you trust the source of this workbook, click Yes.

From ECMA-376-2 Office Open Formats Part 2 (Packaging Conventions) The compression algorithm supported is DEFLATE, as described in the .ZIP specification. The package implementer shall not use any compression algorithm other than DEFLATE.

So, what switches do i need to use in 7z or other linux compatible program to get the job done without the warning? I've tried dropping the -tzip and using -m0=COPY, but excel can't even recover from that one.

So here's the result of the zip program and zipinfo. I'm guessing i'm not going to find a tool to do this, other than the one provided below, so i'm going to award that answer, and see if i can find someone to translate to python for testing. I"m not sure it handles the differences between the 4.5 / 3.0, then b- / tx or the defS / defF though.

$ zipinfo original.xlsx
Archive:  original.xlsx
Zip file size: 228039 bytes, number of entries: 20
-rw----     4.5 fat     1969 b- defS 80-Jan-01 00:00 [Content_Types].xml
-rw----     4.5 fat      588 b- defS 80-Jan-01 00:00 _rels/.rels
-rw----     4.5 fat     1408 b- defS 80-Jan-01 00:00 xl/_rels/workbook.xml.rels
-rw----     4.5 fat      908 b- defS 80-Jan-01 00:00 xl/workbook.xml
-rw----     4.5 fat    35772 b- defS 80-Jan-01 00:00 xl/worksheets/sheet4.xml
-rw----     4.5 fat      322 b- defS 80-Jan-01 00:00 xl/worksheets/_rels/sheet4.xml.rels
-rw----     4.5 fat      322 b- defS 80-Jan-01 00:00 xl/worksheets/_rels/sheet1.xml.rels
-rw----     4.5 fat   230959 b- defS 80-Jan-01 00:00 xl/worksheets/sheet2.xml
-rw----     4.5 fat   263127 b- defS 80-Jan-01 00:00 xl/worksheets/sheet3.xml
-rw----     4.5 fat   295775 b- defS 80-Jan-01 00:00 xl/worksheets/sheet1.xml
-rw----     4.5 fat     1947 b- defS 80-Jan-01 00:00 xl/sharedStrings.xml
-rw----     4.5 fat    22698 b- defS 80-Jan-01 00:00 xl/styles.xml
-rw----     4.5 fat     7079 b- defS 80-Jan-01 00:00 xl/theme/theme1.xml
-rw----     4.5 fat      220 b- defS 80-Jan-01 00:00 xl/printerSettings/printerSettings2.bin
-rw----     4.5 fat   464247 b- defS 80-Jan-01 00:00 xl/externalLinks/externalLink1.xml
-rw----     4.5 fat      338 b- defS 80-Jan-01 00:00 xl/externalLinks/_rels/externalLink1.xml.rels
-rw----     4.5 fat      220 b- defS 80-Jan-01 00:00 xl/printerSettings/printerSettings1.bin
-rw----     4.5 fat      593 b- defS 80-Jan-01 00:00 docProps/core.xml
-rw----     4.5 fat    62899 b- defS 80-Jan-01 00:00 xl/calcChain.xml
-rw----     4.5 fat     1031 b- defS 80-Jan-01 00:00 docProps/app.xml
20 files, 1392422 bytes uncompressed, 223675 bytes compressed:  83.9%

$ zipinfo new.xlsx
Archive:  new.xlsx
Zip file size: 233180 bytes, number of entries: 20
-rw-r--r--  3.0 unx     1031 tx defF 80-Jan-01 00:00 docProps/app.xml
-rw-r--r--  3.0 unx      593 tx defF 80-Jan-01 00:00 docProps/core.xml
-rw-r--r--  3.0 unx    62899 tx defF 80-Jan-01 00:00 xl/calcChain.xml
-rw-r--r--  3.0 unx   464247 tx defF 80-Jan-01 00:00 xl/externalLinks/externalLink1.xml
-rw-r--r--  3.0 unx      338 tx defF 80-Jan-01 00:00 xl/externalLinks/_rels/externalLink1.xml.rels
-rw-r--r--  3.0 unx      220 bx defF 80-Jan-01 00:00 xl/printerSettings/printerSettings1.bin
-rw-r--r--  3.0 unx      220 bx defF 80-Jan-01 00:00 xl/printerSettings/printerSettings2.bin
-rw-r--r--  3.0 unx     1947 tx defF 80-Jan-01 00:00 xl/sharedStrings.xml
-rw-r--r--  3.0 unx    22698 tx defF 80-Jan-01 00:00 xl/styles.xml
-rw-r--r--  3.0 unx     7079 tx defF 80-Jan-01 00:00 xl/theme/theme1.xml
-rw-r--r--  3.0 unx      908 tx defF 80-Jan-01 00:00 xl/workbook.xml
-rw-r--r--  3.0 unx   295775 tx defF 80-Jan-01 00:00 xl/worksheets/sheet1.xml
-rw-r--r--  3.0 unx   230959 tx defF 80-Jan-01 00:00 xl/worksheets/sheet2.xml
-rw-r--r--  3.0 unx   263127 tx defF 80-Jan-01 00:00 xl/worksheets/sheet3.xml
-rw-r--r--  3.0 unx    35772 tx defF 80-Jan-01 00:00 xl/worksheets/sheet4.xml
-rw-r--r--  3.0 unx      322 tx defF 80-Jan-01 00:00 xl/worksheets/_rels/sheet1.xml.rels
-rw-r--r--  3.0 unx      322 tx defF 80-Jan-01 00:00 xl/worksheets/_rels/sheet4.xml.rels
-rw-r--r--  3.0 unx     1408 tx defF 80-Jan-01 00:00 xl/_rels/workbook.xml.rels
-rw-r--r--  3.0 unx     1969 tx defF 80-Jan-01 00:00 [Content_Types].xml
-rw-r--r--  3.0 unx      588 tx defF 80-Jan-01 00:00 _rels/.rels
20 files, 1392422 bytes uncompressed, 229608 bytes compressed:  83.5%

Solution

  • For some strange reason, Microsoft is looking at the operating system encoding in the "version needed to extract" in the local file headers and central directory headers. It wants those to be zero, but 7z is setting them to 3 for Unix. If you are intent on using 7z, then you will need to patch the resulting file.

    This program will do that:

    /* needz.c - zero the operating system byte for "version needed to extract" in
        the local and central headers of the zip files given on the command line.
        Placed in the public domain by Mark Adler, 23 Feb 2013. */
    
    #include <stdio.h>
    #include <stdlib.h>
    
    static void bail(char *why, char *what)
    {
        fprintf(stderr, "needz error: %s%s\n", why, what);
        exit(1);
    }
    
    /* Read len bytes from offset as a little-endian integer.  Negative offsets are
        considered to be from the end of the file. */
    static unsigned long peek(FILE *stream, off_t offset, int len)
    {
        int ret, shift;
        unsigned long val;
    
        ret = fseeko(stream, offset, offset < 0 ? SEEK_END : SEEK_SET);
        if (ret)
            bail("not a zip file", "");
        val = 0;
        shift = 0;
        while (len--) {
            ret = getc(stream);
            if (ret == EOF)
                bail("not a zip file", "");
            val += (unsigned long)ret << shift;
            shift += 8;
        }
        return val;
    }
    
    /* Write len bytes to offset from val as a little-endian integer.  Negative
        offsets are considered to be from the end of the file. */
    static void poke(FILE *stream, off_t offset, int len, unsigned long val)
    {
        int ret;
    
        ret = fseeko(stream, offset, offset < 0 ? SEEK_END : SEEK_SET);
        if (ret)
            bail("not a zip file", "");
        while (len--) {
            ret = putc(val, stream);
            if (ret == EOF)
                bail("could not write", "");
            val >>= 8;
        }
    }
    
    /* Zero out the OS byte in the extract fields.  This assumes the classic zip
        format (not Zip64), and no zip file comment. */
    static void zip_zero_os(char *path)
    {
        FILE *zip;
        unsigned entries;
        off_t central, local;
    
        zip = fopen(path, "r+b");
        if (zip == NULL)
            bail("could not open", path);
        if (peek(zip, -22, 4) != 0x06054b50)
            bail(path, " is not a zip file or has an end comment");
        entries = peek(zip, -12, 2);
        central = peek(zip, -6, 4);
        while (entries--) {
            if (peek(zip, central, 4) != 0x02014b50)
                bail(path, " has a structure error or is Zip64");
            poke(zip, central + 7, 1, 0);
            local = peek(zip, central + 42, 4);
            if (peek(zip, local, 4) != 0x04034b50)
                bail(path, " has a structure error or is Zip64");
            poke(zip, local + 5, 1, 0);
            central += 46 + peek(zip, central + 28, 2) +
                        peek(zip, central + 30, 2) + peek(zip, central + 32, 2);
        }
        if (fclose(zip) == EOF)
            bail("could not close ", path);
    }
    
    int main(int argc, char **argv)
    {
        while (--argc)
            zip_zero_os(*++argv);
        return 0;
    }