Search code examples
crun-length-encoding

RLE implementation failing every once in a while


I implemented the following two functions for RLE compression of binary files.

char* RLEcompress(char* data, size_t origSize, size_t* compressedSize) {
    char* ret = calloc(2 * origSize, 1);
    size_t retIdx = 0, inIdx = 0;
    size_t retSize = 0;
    while (inIdx < origSize) {
        size_t count = 1;
        size_t contIdx = inIdx;
        while (contIdx < origSize - 1 && data[inIdx] == data[++contIdx]) {
            count++;
        }
        size_t tmpCount = count;

        // break down counts with 2 or more digits into counts ≤ 9
        while (tmpCount > 9) {
            tmpCount -= 9;
            ret[retIdx++] = data[inIdx];
            ret[retIdx++] = data[inIdx];
            ret[retIdx++] = '9';
            retSize += 3;
        }

        ret[retIdx++] = data[inIdx];
        retSize += 1;
        if (tmpCount > 1) {
            // repeat character (this tells the decompressor that the next digit
            // is in fact the # of consecutive occurrences of this char)
            ret[retIdx++] = data[inIdx];
            // convert single-digit count to dataing
            ret[retIdx++] = '0' + tmpCount;
            retSize += 2;
        }

        inIdx += count;
    }
    *compressedSize = retSize;
    return ret;
}

char* RLEdecompress(char* data, size_t compressedSize, size_t uncompressedSize, size_t extraAllocation) {
    char* ret = calloc(uncompressedSize + extraAllocation, 1);
    size_t retIdx = 0, inIdx = 0;
    while (inIdx < compressedSize) {
        ret[retIdx++] = data[inIdx];
        if (data[inIdx] == data[inIdx + 1]) { // next digit is the # of occurrences
            size_t occ = ((data[inIdx + 2]) - '0');
            for (size_t i = 1; i < occ && retIdx < compressedSize; i++) {
                ret[retIdx++] = data[inIdx];
            }
            inIdx += 2;
        }
        inIdx += 1;
    }
    return ret;
}

They seem to work fine, i.e. diff doesn't produce any output when comparing the original files to the compressed-then-uncompressed versions.

However, every once in a while, the files will differ indicating there is a bug somewhere. I haven't been able to find a pattern in the files that exhibit this, but I'll give you an example of what the difference looks like.

enter image description here

The lower one is the original.

As you can see, the byte 21 is repeated twice in the compressed-then-uncompressed version. I haven't been able to identify the issue. Unfortunately the bug happens with very few files: so far I've only observed it with two pdf files, including the one shown above, but I can't share them because it's copyrighted content, but I'm working on coming up with another file that fails so I can provide you with an example.

I have a feeling there is something "obvious" wrong with the code above and I'm just missing it. Help is greatly appreciated.

EDIT:

Here's a test program I'm using to read the offending file, compressing it, then decompressing it. I'm also saving the compressed one to disk in a middle step to have more debug data.

int main(int argc, char** argv) {
    size_t compsz;
    FILE* fp = fopen(argv[1], "r");
    if (!fp) {
        perror("fp");
        return 1;
    }

    if (fseek(fp, 0L, SEEK_END) == -1) {
        return -1;
    }
    // get file size
    size_t filecontentLen = ftell(fp);
    if (filecontentLen < 0) {
        return -1;
    }
    rewind(fp);

    char* filecontentBuf = calloc(filecontentLen, 1);
    if (!filecontentBuf) {
        fclose(fp);
        errno = ENOMEM;
        return -1;
    }
    // read original
    if (fread(filecontentBuf, sizeof(char), filecontentLen, fp) <= 0) {
        int errnosave = errno;
        if (ferror(fp)) {
            fclose(fp);
            free(filecontentBuf);
            errno = errnosave;
            return -1;
        }
    }
    // write compressed
    char* compressed = RLEcompress(filecontentBuf, filecontentLen, &compsz);
    FILE* fpcompWrite = fopen("compressed", "w+");
    if (fwrite(compressed, compsz, 1, fpcompWrite) == -1) {
        perror("fwrite");
    }
    fclose(fpcompWrite);

    // read compressed
    FILE* fpcompRead = fopen("compressed", "r");
    if (!fpcompRead) {
        perror("fpcompRead");
        return 1;
    }
    char* compBuf = calloc(compsz * 2, 1);
    fread(compBuf, compsz, 1, fpcompRead);
    fclose(fpcompRead);

    // decompress and write file
    char* uncompBuf = RLEdecompress(compBuf, compsz, filecontentLen, 0);
    FILE* funcomp = fopen("uncompressed", "w+");
    fwrite(uncompBuf, filecontentLen, 1, funcomp);
    fclose(funcomp);
}

Solution

  • I think the problem is that

    for (size_t i = 1; i < occ && retIdx < compressedSize; i++) {
        ret[retIdx++] = data[inIdx];
    }
    

    should be changed in

    for (size_t i = 1; i < occ && retIdx < uncompressedSize; i++) {
        ret[retIdx++] = data[inIdx];
    }
    

    in the decompression algorithm, since redIdx is bounded by uncompressedSize, and maybe in some rare cases it copies fewer bytes than it should.