This (1.4M) can be decompressed with gzip no problem with say
gzip --stdout --decompress ./archiveteam_blip_20150813160102.cdx.gz
It decompresses to 61400
lines of text.
However, with this code based on zpipe.c
, the example script given on the libz website (run with ./zpipe archiveteam_blip_20150813160102.cdx.gz
, I only get 3000 lines of data:
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include "zlib.h"
#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
# include <fcntl.h>
# include <io.h>
# define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
#else
# define SET_BINARY_MODE(file)
#endif
#define CHUNK 16384
int inf(FILE *source, FILE *dest)
{
int ret;
unsigned have;
z_stream strm;
unsigned char in[CHUNK];
unsigned char out[CHUNK];
/* allocate inflate state */
strm.zalloc = Z_NULL;
strm.zfree = Z_NULL;
strm.opaque = Z_NULL;
strm.avail_in = 0;
strm.next_in = Z_NULL;
ret = inflateInit2(&strm, 16 + MAX_WBITS);
if (ret != Z_OK)
return ret;
/* decompress until deflate stream ends or end of file */
do {
strm.avail_in = fread(in, 1, CHUNK, source);
if (ferror(source)) {
(void)inflateEnd(&strm);
return Z_ERRNO;
}
if (strm.avail_in == 0)
break;
strm.next_in = in;
/* run inflate() on input until output buffer not full */
do {
strm.avail_out = CHUNK;
strm.next_out = out;
ret = inflate(&strm, Z_NO_FLUSH);
assert(ret != Z_STREAM_ERROR); /* state not clobbered */
switch (ret) {
case Z_NEED_DICT:
ret = Z_DATA_ERROR; /* and fall through */
case Z_DATA_ERROR:
case Z_MEM_ERROR:
(void)inflateEnd(&strm);
return ret;
}
have = CHUNK - strm.avail_out;
if (fwrite(out, 1, have, dest) != have || ferror(dest)) {
(void)inflateEnd(&strm);
return Z_ERRNO;
}
} while (strm.avail_out == 0);
/* done when inflate() says it's done */
} while (ret != Z_STREAM_END);
/* clean up and return */
(void)inflateEnd(&strm);
return ret == Z_STREAM_END ? Z_OK : Z_DATA_ERROR;
}
/* compress or decompress from stdin to stdout */
int main(int argc, char **argv)
{
int ret;
FILE *source = fopen(argv[1], "r");
/* avoid end-of-line conversions */
SET_BINARY_MODE(source);
SET_BINARY_MODE(stdout);
ret = inf(source, stdout);
if (ret != Z_OK)
{
fputs("zpipe: ", stderr);
switch (ret) {
case Z_ERRNO:
if (ferror(stdin))
fputs("error reading stdin\n", stderr);
if (ferror(stdout))
fputs("error writing stdout\n", stderr);
break;
case Z_STREAM_ERROR:
fputs("invalid compression level\n", stderr);
break;
case Z_DATA_ERROR:
fputs("invalid or incomplete deflate data\n", stderr);
break;
case Z_MEM_ERROR:
fputs("out of memory\n", stderr);
break;
case Z_VERSION_ERROR:
fputs("zlib version mismatch!\n", stderr);
}
}
return ret;
}
Your gzip file has multiple gzip members. A concatenation of any number of gzip streams is also a valid gzip stream, per the specification. From the documentation in zlib.h:
Unlike the gunzip utility and
gzread()
(see below),inflate()
will not automatically decode concatenated gzip members.inflate()
will returnZ_STREAM_END
at the end of the gzip member. The state would need to be reset to continue decoding a subsequent gzip member. This must be done if there is more data after a gzip member, in order for the decompression to be compliant with the gzip standard (RFC 1952).
Upon getting Z_STREAM_END
, you need to see if you have any data left (strm.avail_in != 0
) or if you can read in more data. If so, you need to do an inflateReset()
and decode the next gzip member by then continuing in the do
loop.