My goal is to create a zip archive from scratch using only zlib. I have got pretty far: I can create a valid archive when I do not compress the data and set the appropriate header flags. But when I add compressed data to the archive and try to extract it with `unzip`, it gives me the error `invalid compressed data to inflate`.
I have already tried reading the files back from the created archive and decompressing the file data to check whether the compressed data is indeed corrupted. However, I am able to decompress all of the file data. This test section is at the bottom of the `create` function.
I also tried extracting the archive with other tools and even online zip extractors, but they all fail.
Currently this example does not support ZIP64, since I would like to get it working first and then expand from there.
I am aware that this is not a fully reproducible example, but I cannot include all of the necessary code, and I am really stuck. All of the code that is implemented but not included here is thoroughly tested. The `Array` class works just like `std::vector`. The `String` class also works a lot like `std::string`. The `File` class is just a wrapper around `FILE*`. And the `Compression` class just compresses the data with zlib. Every one of those classes is thoroughly tested and works correctly. `SICE` is a shortcut for `static inline constexpr`.
This is the structure used to create an archive.
struct Zip {
// Private.
private:
// ---------------------------------------------------------
// Structs.
// One archive member as collected by create(): the name stored in the
// archive plus the original and the compressed file contents.
// Field order matters: create() fills this with designated initializers.
struct Entry {
    Path   sub_path {};     // path of the file relative to the source directory.
    String data {};         // uncompressed file contents.
    String compressed {};   // compressed file contents.
};
// Local file header that is written in front of every file's data.
// Every member has a default so a default-constructed header never holds
// indeterminate values; the members marked "TO ASSIGN" must still be set
// per file before the header is written.
struct FileHeader {
    SICE uint32_t signature = 0x04034b50;
    uint16_t version = 20;
    uint16_t general_flag = 0;
    uint16_t compression_method = 8; // deflated.
    uint16_t mod_time = 0;
    uint16_t mod_date = 0;
    uint32_t crc = 0; // TO ASSIGN.
    uint32_t compressed_len = 0; // TO ASSIGN.
    uint32_t uncompressed_len = 0; // TO ASSIGN.
    uint16_t name_len = 0; // TO ASSIGN.
    uint16_t extra_field_len = 0;

    // Equals (for debugging).
    // The signature is a compile-time constant shared by all headers, so
    // it is not part of the comparison.
    constexpr friend
    bool operator ==(const FileHeader& x, const FileHeader& y) {
        return
            // x.signature == y.signature &&
            x.version == y.version &&
            x.general_flag == y.general_flag &&
            x.compression_method == y.compression_method &&
            x.mod_time == y.mod_time &&
            x.mod_date == y.mod_date &&
            x.crc == y.crc &&
            x.compressed_len == y.compressed_len &&
            x.uncompressed_len == y.uncompressed_len &&
            x.name_len == y.name_len &&
            x.extra_field_len == y.extra_field_len;
    }

    // Dump to pipe (for debugging).
    constexpr friend
    auto& operator <<(Pipe& pipe, const FileHeader& obj) {
        return pipe << "FileHeader(\n"
        " signature: " << obj.signature << ",\n" <<
        " version: " << obj.version << ",\n" <<
        " general_flag: " << obj.general_flag << ",\n" <<
        " compression_method: " << obj.compression_method << ",\n" <<
        " mod_time: " << obj.mod_time << ",\n" <<
        " mod_date: " << obj.mod_date << ",\n" <<
        " crc: " << obj.crc << ",\n" <<
        " compressed_len: " << obj.compressed_len << ",\n" <<
        " uncompressed_len: " << obj.uncompressed_len << ",\n" <<
        " name_len: " << obj.name_len << ",\n" <<
        " extra_field_len: " << obj.extra_field_len <<
        ")";
    }
};
// Central directory header; one is written per file after all file data.
// Every member has a default so a default-constructed header never holds
// indeterminate values; the members marked "TO ASSIGN" must still be set
// per file before the header is written.
struct CentralDirHeader {
    SICE uint32_t signature = 0x02014b50;
    uint16_t made_version = 45;
    uint16_t version = 20;
    uint16_t general_flag = 0;
    uint16_t compression_method = 8; // deflated.
    uint16_t mod_time = 0;
    uint16_t mod_date = 0;
    uint32_t crc = 0; // TO ASSIGN.
    uint32_t compressed_len = 0; // TO ASSIGN.
    uint32_t uncompressed_len = 0; // TO ASSIGN.
    uint16_t name_len = 0; // TO ASSIGN.
    uint16_t extra_field_len = 0;
    uint16_t comment_len = 0;
    uint16_t disk = 0;
    uint16_t internal_file_attr = 0;
    uint32_t external_file_attr = 0;
    uint32_t relative_offset = 0; // offset of the matching local file header; set by create().
};
// End of central directory record; written once at the end of the archive.
// Every member has a default so the record never holds indeterminate
// values; the members marked "TO ASSIGN" are filled in by create().
struct EndOfCentralDir {
    SICE uint32_t signature = 0x06054b50;
    uint16_t disk = 0;
    uint16_t start_central_disk = 0;
    uint16_t start_disk_entries = 0; // TO ASSIGN.
    uint16_t entries = 0; // TO ASSIGN.
    uint32_t central_size = 0; // TO ASSIGN.
    uint32_t relative_offset = 0; // TO ASSIGN
    uint16_t comment_len = 0;
};
// ---------------------------------------------------------
// Attributes.
Path m_path;
// ---------------------------------------------------------
// Private functions.
// Compute CRC-32 checksum.
static
uint32_t compute_crc32(const char* data, const ullong& len) {
uLong crc = crc32(0L, Z_NULL, 0);
crc = crc32(crc, (uchar*) data, len);
return (uint32_t) crc;
}
// Write the local file header for a file entry, followed by its file name.
// @param archive  the output file; it is flushed before returning.
// @param entry    supplies the file name (entry.sub_path).
// @param header   the header values to serialize.
void write_file_header(File& archive, const Entry& entry, const FileHeader& header) const {

    // Serialize one unsigned integer field least-significant byte first.
    // The ZIP format stores all header integers little-endian, so the bytes
    // are produced explicitly instead of copying the host representation.
    const auto put = [&archive](const auto value) {
        char bytes[sizeof(value)];
        for (unsigned i = 0; i < sizeof(value); ++i) {
            bytes[i] = static_cast<char>((value >> (8 * i)) & 0xFF);
        }
        archive.write(bytes, sizeof(value));
    };

    // Write header (field order is the on-disk order).
    put(header.signature);
    put(header.version);
    put(header.general_flag);
    put(header.compression_method);
    put(header.mod_time);
    put(header.mod_date);
    put(header.crc);
    put(header.compressed_len);
    put(header.uncompressed_len);
    put(header.name_len);
    put(header.extra_field_len);

    // File name
    archive.write(entry.sub_path.c_str(), entry.sub_path.len());

    // Flush to file.
    archive.flush();
}
// Write the central directory file header for a file entry, followed by
// its file name.
// @param archive  the output file; it is flushed before returning.
// @param entry    supplies the file name (entry.sub_path).
// @param header   the header values to serialize.
void write_central_dir_header(File& archive, const Entry& entry, const CentralDirHeader& header) const {

    // Serialize one unsigned integer field least-significant byte first.
    // The ZIP format stores all header integers little-endian, so the bytes
    // are produced explicitly instead of copying the host representation.
    const auto put = [&archive](const auto value) {
        char bytes[sizeof(value)];
        for (unsigned i = 0; i < sizeof(value); ++i) {
            bytes[i] = static_cast<char>((value >> (8 * i)) & 0xFF);
        }
        archive.write(bytes, sizeof(value));
    };

    // Write header (field order is the on-disk order).
    put(header.signature);
    put(header.made_version);
    put(header.version);
    put(header.general_flag);
    put(header.compression_method);
    put(header.mod_time);
    put(header.mod_date);
    put(header.crc);
    put(header.compressed_len);
    put(header.uncompressed_len);
    put(header.name_len);
    put(header.extra_field_len);
    put(header.comment_len);
    put(header.disk);
    put(header.internal_file_attr);
    put(header.external_file_attr);
    put(header.relative_offset);

    // File name
    archive.write(entry.sub_path.c_str(), entry.sub_path.len());

    // Flush to file.
    archive.flush();
}
// Write the end of central directory record.
// @param archive  the output file; it is flushed before returning.
// @param header   the record values to serialize.
void write_end_of_central_dir(File& archive, const EndOfCentralDir& header) const {

    // Serialize one unsigned integer field least-significant byte first.
    // The ZIP format stores all header integers little-endian, so the bytes
    // are produced explicitly instead of copying the host representation.
    const auto put = [&archive](const auto value) {
        char bytes[sizeof(value)];
        for (unsigned i = 0; i < sizeof(value); ++i) {
            bytes[i] = static_cast<char>((value >> (8 * i)) & 0xFF);
        }
        archive.write(bytes, sizeof(value));
    };

    // Write header (field order is the on-disk order).
    put(header.signature);
    put(header.disk);
    put(header.start_central_disk);
    put(header.start_disk_entries);
    put(header.entries);
    put(header.central_size);
    put(header.relative_offset);
    put(header.comment_len);

    // Flush to file.
    archive.flush();
}
// Public.
public:
// ---------------------------------------------------------
// Constructor.
// Default constructor.
constexpr
Zip() = default;
// Constructor from path.
constexpr
Zip(const Path& path) :
m_path(path) {}
// Copy constructor.
constexpr
Zip(const Zip& obj) :
m_path(obj.m_path) {}
// Move constructor.
constexpr
Zip(Zip&& obj) :
m_path(move(obj.m_path)) {}
// ---------------------------------------------------------
// Functions.
// Compress data.
/* @docs {
* @title: Create
* @description:
* Create a zip archive from a file or directory.
* @parameter: {
* @name: _source
* @description: The source file or directory.
* }
* @usage:
* vlib::Zip zip("/tmp/zip.archive");
* zip.create("/tmp/dir/");
} */
void create(const Path& _source) const {
// Vars.
Path source = _source; // Make non const for certain funcs
File output (m_path);
Compression compression (Z_BEST_COMPRESSION);
Array<Entry> entries;
Array<FileHeader> file_headers;
Array<CentralDirHeader> central_dir_headers;
EndOfCentralDir end_of_central_dir;
// Remove & check.
if (m_path.exists()) {
m_path.remove();
}
if (!source.exists()) {
throw exceptions::FileNotFoundError("File \"", source, "\" does not exist.");
}
// Open output file.
output.close();
output.open();
// Path is a file.
if (source.is_file()) {
throw exceptions::CreateError("TODO.");
}
// Path is a dir.
else {
// Vars.
// Create entries.
const ullong slice = source.len() + 1;
for (auto& path: source.paths()) {
// Skip.
if (path.is_dir()) {
continue;
}
// Vars.
String sub_path = path.slice(slice);
String data = path.load();
String compressed = compression.compress(data);
uint32_t crc = compute_crc32(data.data(), data.len());
// Append.
file_headers.append(FileHeader {
.crc = (uint32_t) crc,
.compressed_len = (uint32_t) compressed.len(),
.uncompressed_len = (uint32_t) data.len(),
.name_len = (uint16_t) sub_path.len(),
});
central_dir_headers.append(CentralDirHeader {
.crc = (uint32_t) crc,
.compressed_len = (uint32_t) compressed.len(),
.uncompressed_len = (uint32_t) data.len(),
.name_len = (uint16_t) sub_path.len(),
});
entries.append(Entry {
.sub_path = move(sub_path),
.data = move(data),
.compressed = move(compressed),
});
}
// Write files.
for (auto& index: entries.indexes()) {
central_dir_headers[index].relative_offset = ftell(output.file());
write_file_header(output, entries[index], file_headers[index]);
output.write(entries[index].compressed.data(), entries[index].compressed.len());
// output.write(entries[index].data.data(), entries[index].data.len());
}
// Write central dir.
end_of_central_dir.relative_offset = ftell(output.file());
for (auto& index: entries.indexes()) {
write_central_dir_header(output, entries[index], central_dir_headers[index]);
}
end_of_central_dir.start_disk_entries = entries.len();
end_of_central_dir.entries = entries.len();
end_of_central_dir.central_size = ftell(output.file()) - end_of_central_dir.relative_offset;
write_end_of_central_dir(output, end_of_central_dir);
}
// Close.
output.close();
// Read output to check if the written data can be decompressed.
// Which works.
String data = Path::load(m_path);
ullong pos = 0, file_header_index = 0;
constexpr uint signature_len = sizeof(uint32_t);
while (pos + signature_len <= data.len()) {
const uint32_t signature = *((uint32_t*) &data[pos]);
// File header.
if (signature == FileHeader::signature) {
print("Found file header at ", pos, ".");
FileHeader header;
pos += sizeof(uint32_t); // skip signature.
header.version = *((uint16_t*) &data[pos]);
pos += sizeof(uint16_t);
header.general_flag = *((uint16_t*) &data[pos]);
pos += sizeof(uint16_t);
header.compression_method = *((uint16_t*) &data[pos]);
pos += sizeof(uint16_t);
header.mod_time = *((uint16_t*) &data[pos]);
pos += sizeof(uint16_t);
header.mod_date = *((uint16_t*) &data[pos]);
pos += sizeof(uint16_t);
header.crc = *((uint32_t*) &data[pos]);
pos += sizeof(uint32_t);
header.compressed_len = *((uint32_t*) &data[pos]);
pos += sizeof(uint32_t);
header.uncompressed_len = *((uint32_t*) &data[pos]);
pos += sizeof(uint32_t);
header.name_len = *((uint16_t*) &data[pos]);
pos += sizeof(uint16_t);
header.extra_field_len = *((uint16_t*) &data[pos]);
pos += sizeof(uint16_t);
String sub_path (&data[pos], header.name_len);
pos += header.name_len;
String compressed (&data[pos], header.compressed_len);
compression.m_level = header.compression_method;
String raw = compression.decompress(compressed);
// They are all equal the original.
print("HEADER EQ = ", header == file_headers[file_header_index]);
print("SUBPATH EQ = ", sub_path == entries[file_header_index].sub_path);
print("DATA EQ = ", raw == entries[file_header_index].data);
++file_header_index;
}
// Central dir header.
else if (signature == CentralDirHeader::signature) {
print("Found central dir header at ", pos, ".");
}
// End of central dir header.
else if (signature == EndOfCentralDir::signature) {
print("Found end of central dir header at ", pos, ".");
}
pos += 1;
}
}
};
The zip is created with:
Zip zip("/tmp/archive.zip");
zip.create("/tmp/dir/");
Edit.
This is the `compress` function used by `Zip`. `m_level` is the zlib compression level (`Z_BEST_COMPRESSION` by default) and `m_limit` is the maximum number of bytes that can be compressed.
// Compress `len` bytes starting at `data` with zlib's deflate.
//
// @param data         the bytes to compress.
// @param len          the number of bytes to compress; 0 returns an empty String.
// @param window_bits  zlib's windowBits: the base two logarithm of the
//                     window size, which also selects the output framing:
//                       -8 to -15         raw deflate (what a zip entry needs)
//                        8 to 15          zlib
//                       (8 to 15) + 16    gzip
//                     Defaults to gzip with a window of 15 bits, which was
//                     the fixed behaviour before this parameter existed.
// @return the compressed bytes.
// @throws exceptions::LimitError    when `len` exceeds m_limit or m_max.
// @throws exceptions::DeflateError  when zlib can not initialise the stream
//                                   or reports an error while compressing.
String compress(const char* data, const ullong& len, const int window_bits = 15 + 16) const {
    if (len == 0) { return String(); }

    // Verify if len input will fit into uint, type used for zlib's avail_in
    if (len > m_limit) {
        throw exceptions::LimitError(max_len_err);
    }

    // Verify length.
    if (m_max != -1 && len > (Length) m_max) {
        throw exceptions::LimitError(max_len_err);
    }

    z_stream deflate_s;
    deflate_s.zalloc = Z_NULL;
    deflate_s.zfree = Z_NULL;
    deflate_s.opaque = Z_NULL;
    deflate_s.avail_in = 0;
    deflate_s.next_in = Z_NULL;

    // The memory requirements for deflate are (in bytes):
    // (1 << (window_bits+2)) + (1 << (mem_level+9))
    // with a default value of 8 for mem_level and a window of 15 bits
    // this is 128Kb
    constexpr int mem_level = 8;

    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wold-style-cast"
    if (deflateInit2(&deflate_s, m_level, Z_DEFLATED, window_bits, mem_level, Z_DEFAULT_STRATEGY) != Z_OK) {
        throw exceptions::DeflateError(deflate_err);
    }
    #pragma GCC diagnostic pop

    // Release the zlib state on every exit path, including exceptions
    // thrown while the output buffer grows.
    struct StreamGuard {
        z_stream* stream;
        ~StreamGuard() { deflateEnd(stream); }
    };
    const StreamGuard guard { &deflate_s };

    deflate_s.next_in = reinterpret_cast<z_const Bytef*>((char*) data);
    deflate_s.avail_in = static_cast<uint>(len);

    String output;
    int status = Z_OK;
    do {
        Length increase = len / 2 + 1024;
        output.expand(increase);

        // "increase" always fits in an uint because len was checked
        // against m_limit above, hence the static cast to avoid the
        // -Wshorten-64-to-32 error.
        deflate_s.avail_out = static_cast<uint>(increase);
        deflate_s.next_out = reinterpret_cast<Bytef*>((output.data() + output.len()));

        // All input is available up front, so every call finishes the
        // stream as far as the output space allows: Z_OK asks for more
        // output space, Z_STREAM_END means everything has been written.
        status = deflate(&deflate_s, Z_FINISH);
        if (status != Z_OK && status != Z_STREAM_END) {
            throw exceptions::DeflateError(deflate_err);
        }
        output.len() += (increase - deflate_s.avail_out);
    } while (status != Z_STREAM_END);

    return output;
}
I am using macOS 13.4 and unzip 6.00 (I have also tried a newer version, but it results in the same problem).
Here are some zipinfo / unzip logs.
$ unzip -t ../archive.zip
Archive: ../archive.zip
testing: install
error: invalid compressed data to inflate
testing: README.md
error: invalid compressed data to inflate
At least one error was detected in ../archive.zip.
$ zipinfo ../archive.zip
Archive: ../archive.zip
Zip file size: 1142 bytes, number of entries: 2
-rw---- 4.5 fat 1242 b- defN 80-000-00 00:00 install
-rw---- 4.5 fat 917 b- defN 80-000-00 00:00 README.md
2 files, 2159 bytes uncompressed, 936 bytes compressed: 56.6%
$ zipinfo -v ../archive.zip
Archive: ../archive.zip
There is no zipfile comment.
End-of-central-directory record:
-------------------------------
Zip archive file size: 1142 (0000000000000476h)
Actual end-cent-dir record offset: 1120 (0000000000000460h)
Expected end-cent-dir record offset: 1120 (0000000000000460h)
(based on the length of the central directory and its expected offset)
This zipfile constitutes the sole disk of a single-part archive; its
central directory contains 2 entries.
The central directory is 108 (000000000000006Ch) bytes long,
and its (expected) offset in bytes from the beginning of the zipfile
is 1012 (00000000000003F4h).
Central directory entry #1:
---------------------------
install
offset of local header from start of archive: 0
(0000000000000000h) bytes
file system or operating system of origin: MS-DOS, OS/2 or NT FAT
version of encoding software: 4.5
minimum file system compatibility required: MS-DOS, OS/2 or NT FAT
minimum software version required to extract: 2.0
compression method: deflated
compression sub-type (deflation): normal
file security status: not encrypted
extended local header: no
file last modified on (DOS date/time): 1980 000 0 00:00:00
32-bit CRC value (hex): 440cf502
compressed size: 424 bytes
uncompressed size: 1242 bytes
length of filename: 7 characters
length of extra field: 0 bytes
length of file comment: 0 characters
disk number on which file begins: disk 1
apparent file type: binary
non-MSDOS external file attributes: 000000 hex
MS-DOS file attributes (00 hex): none
There is no file comment.
Central directory entry #2:
---------------------------
README.md
offset of local header from start of archive: 461
(00000000000001CDh) bytes
file system or operating system of origin: MS-DOS, OS/2 or NT FAT
version of encoding software: 4.5
minimum file system compatibility required: MS-DOS, OS/2 or NT FAT
minimum software version required to extract: 2.0
compression method: deflated
compression sub-type (deflation): normal
file security status: not encrypted
extended local header: no
file last modified on (DOS date/time): 1980 000 0 00:00:00
32-bit CRC value (hex): 1db2d91a
compressed size: 512 bytes
uncompressed size: 917 bytes
length of filename: 9 characters
length of extra field: 0 bytes
length of file comment: 0 characters
disk number on which file begins: disk 1
apparent file type: binary
non-MSDOS external file attributes: 000000 hex
MS-DOS file attributes (00 hex): none
There is no file comment.
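You need to use raw deflate for zip entries. Set `window_bits` to `-15`. With `15 + 16`, zlib wraps the deflate stream in a gzip header and trailer, which a zip reader does not expect inside an entry.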
You should be able to handle an input size greater than the maximum value of an unsigned int
. You already have a loop for multiple deflate()
calls, so you just need to update avail_in
, and only use Z_FINISH
when the last input is provided. Something like:
...
// `len` counts the input bytes not yet handed to zlib, so it has to be a
// mutable copy of the input length here.
deflate_s.avail_in = 0;
do {
    // Hand zlib the next slice of input once the previous one is used up.
    if (deflate_s.avail_in == 0) {
        deflate_s.avail_in = len > UINT_MAX ? UINT_MAX : (uint)len;
        len -= deflate_s.avail_in;
    }
    ...
    // Only finish the stream once the last slice has been provided.
    deflate(&deflate_s, len ? Z_NO_FLUSH : Z_FINISH);
    ...
    // Keep going while input remains OR the output buffer was filled
    // (which means there may be more output pending).
} while (len || deflate_s.avail_out == 0);
...
There should be no need for an m_max
.