Search code examples
cgzipzlib

libz does not extract entire gzip file


This (1.4M) can be decompressed with gzip no problem with say

gzip --stdout --decompress ./archiveteam_blip_20150813160102.cdx.gz

It decompresses to 61400 lines of text.

However, with this code based on zpipe.c, the example script given on the libz website (run with ./zpipe archiveteam_blip_20150813160102.cdx.gz, I only get 3000 lines of data:

#include <stdio.h>
#include <string.h>
#include <assert.h>
#include "zlib.h"

#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
#  include <fcntl.h>
#  include <io.h>
#  define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
#else
#  define SET_BINARY_MODE(file)
#endif

#define CHUNK 16384

int inf(FILE *source, FILE *dest)
{
    int ret;
    unsigned have;
    z_stream strm;
    unsigned char in[CHUNK];
    unsigned char out[CHUNK];

    /* allocate inflate state */
    strm.zalloc = Z_NULL;
    strm.zfree = Z_NULL;
    strm.opaque = Z_NULL;
    strm.avail_in = 0;
    strm.next_in = Z_NULL;
    ret = inflateInit2(&strm, 16 + MAX_WBITS);
    if (ret != Z_OK)
        return ret;

    /* decompress until deflate stream ends or end of file */
    do {
        strm.avail_in = fread(in, 1, CHUNK, source);
        if (ferror(source)) {
            (void)inflateEnd(&strm);
            return Z_ERRNO;
        }
        if (strm.avail_in == 0)
            break;
        strm.next_in = in;

        /* run inflate() on input until output buffer not full */
        do {
            strm.avail_out = CHUNK;
            strm.next_out = out;
            ret = inflate(&strm, Z_NO_FLUSH);
            assert(ret != Z_STREAM_ERROR);  /* state not clobbered */
            switch (ret) {
            case Z_NEED_DICT:
                ret = Z_DATA_ERROR;     /* and fall through */
            case Z_DATA_ERROR:
            case Z_MEM_ERROR:
                (void)inflateEnd(&strm);
                return ret;
            }
            have = CHUNK - strm.avail_out;
            if (fwrite(out, 1, have, dest) != have || ferror(dest)) {
                (void)inflateEnd(&strm);
                return Z_ERRNO;
            }
        } while (strm.avail_out == 0);

        /* done when inflate() says it's done */
    } while (ret != Z_STREAM_END);

    /* clean up and return */
    (void)inflateEnd(&strm);
    return ret == Z_STREAM_END ? Z_OK : Z_DATA_ERROR;
}

/* compress or decompress from stdin to stdout */
int main(int argc, char **argv)
{
    int ret;

    FILE *source = fopen(argv[1], "r");

    /* avoid end-of-line conversions */
    SET_BINARY_MODE(source);
    SET_BINARY_MODE(stdout);
    
    ret = inf(source, stdout);
    
    if (ret != Z_OK)
    {
        fputs("zpipe: ", stderr);
        switch (ret) {
        case Z_ERRNO:
            if (ferror(stdin))
                fputs("error reading stdin\n", stderr);
            if (ferror(stdout))
                fputs("error writing stdout\n", stderr);
            break;
        case Z_STREAM_ERROR:
            fputs("invalid compression level\n", stderr);
            break;
        case Z_DATA_ERROR:
            fputs("invalid or incomplete deflate data\n", stderr);
            break;
        case Z_MEM_ERROR:
            fputs("out of memory\n", stderr);
            break;
        case Z_VERSION_ERROR:
            fputs("zlib version mismatch!\n", stderr);
        }
    }
    
    return ret;
}

Solution

  • Your gzip file has multiple gzip members. A concatenation of any number of gzip streams is also a valid gzip stream, per the specification. From the documentation in zlib.h:

    Unlike the gunzip utility and gzread() (see below), inflate() will not automatically decode concatenated gzip members. inflate() will return Z_STREAM_END at the end of the gzip member. The state would need to be reset to continue decoding a subsequent gzip member. This must be done if there is more data after a gzip member, in order for the decompression to be compliant with the gzip standard (RFC 1952).

    Upon getting Z_STREAM_END, you need to see if you have any data left (strm.avail_in != 0) or if you can read in more data. If so, you need to do an inflateReset() and decode the next gzip member by then continuing in the do loop.