Search code examples
pythonpdfpypdf

Image extracted from PDF using PyPDF2 is skewed, and its colors are inverted


I use the following code to extract images from PDF files:

from PIL import Image
from PyPDF2 import PdfFileReader, generic
from io import BytesIO
from typing import List
import zlib
import sys
import struct

def tiff_header_for_CCITT(width:int, height:int, img_size:int, CCITT_group:int=4) -> bytes:
    tiff_header_struct = '<' + '2s' + 'h' + 'l' + 'h' + 'hhll' * 8 + 'l'
    return struct.pack(tiff_header_struct,
                       b'II',
                       42,
                       8,
                       8,
                       256, 4, 1, width,
                       257, 4, 1, height,
                       258, 3, 1, 1,
                       259, 3, 1, CCITT_group,
                       262, 3, 1, 0,
                       273, 4, 1, struct.calcsize(tiff_header_struct),
                       278, 4, 1, height,
                       279, 4, 1, img_size,
                       0
                       )

def object_to_images(objects:generic.DictionaryObject) -> List:
    images:List[Image] = []

    for idx, key in enumerate(objects):
        obj = objects[key] # object est un mot-clé réservé

        if '/Resources' in obj and '/XObject' in obj['/Resources']: # l'objet est un conteneur
            images += object_to_images(obj["/Resources"]["/XObject"].getObject()) # récursion
        elif obj['/Subtype'] == '/Image':
            print(obj)
            if "/ImageMask" in obj: # masque d'image, ignoré
                continue

            img_modes = {'/DeviceRGB': 'RGB',
                         '/DefaultRGB': 'RGB',
                         '/DeviceCMYK': 'CMYK',
                         '/DefaultCMYK': 'CMYK',
                         '/DeviceGray': 'L',
                         '/DefaultGray': 'L',
                         '/Indexed': 'P'}

            cspace = obj.get('/ColorSpace').getObject()

            indexed = False

            if cspace:
                if isinstance(cspace, generic.ArrayObject):
                    if cspace[0] == '/ICCBased':
                        color_map = cspace[1].getObject()['/N']
                        if color_map == 1:
                            colorspace = "P"
                        elif color_map == 3:
                            colorspace = "RGB"
                        elif color_map == 4:
                            colorspace = "CMYK"
                    elif cspace[0] == "/Indexed":
                        c, b, h, l = [v.getObject() for v in cspace]
                        indexed = True
                        colorspace = img_modes[c]
                else:
                    colorspace = img_modes[cspace]

            width = obj['/Width']
            height = obj['/Height']

            if "/FlateDecode" in obj["/Filter"]: # Compressé avec zlib
                data = zlib.decompress(obj._data)
            else:
                data = obj._data

            if "/DCTDecode" in obj["/Filter"]: # JPEG, rien à faire
                img = Image.open(BytesIO(data))
                images.append(img)
            elif "/JPXDecode" in obj["/Filter"]: # JPEG 2000, rien à faire
                img = Image.open(BytesIO(data))
                images.append(img)
            elif "CCITTFaxDecode" in obj["/Filter"]: # TIFF, rajouter l'en-tête
                if obj['/DecodeParms']['/K'] == -1:
                    CCITT_group = 4
                else:
                    CCITT_group = 3

                img_size = len(data)
                tiff_header = tiff_header_for_CCITT(width, height, img_size, CCITT_group)

                data = tiff_header + data
                img = Image.open(BytesIO(data))
                images.append(img)
            else:
                img = Image.frombytes(colorspace, (width, height), data)
                if indexed:
                    img.putpalette(l.getData())
                    img = img.convert('RGB')
                images.append(img)

    return images

def pdf_to_images(filename:str) -> List:
    images:List[Image] = []

    file = PdfFileReader(open(filename, "rb"))

    nPages = file.getNumPages()

    for i in range(nPages):
        page = file.getPage(i)
        try:
            root_objects = page["/Resources"]["/XObject"].getObject() # devrait s'appeler getObjects...
        except KeyError:
            continue
        images += object_to_images(root_objects)

    return images

if __name__ == "__main__":
    fichier = sys.argv[1]

    images = pdf_to_images(fichier)

    #print("\n".join(map(str, images)))

    for image in images:
        image.show()
        pass

It works for almost all PDF files, but one of them is acting weirdly. It's a sample PDF that can be found here. When I pass this PDF file to the code above, the result is skewed and the black and white colors are inverted.

The only thing I've noticed about the image object in this PDF is that it has a DecodeParms entry, whose value is {'/Predictor': 15, '/Columns': 2550, '/Colors': 3}. I don't know if it's relevant, but it's the only PDF I've tested that has those.

Thanks in advance


Solution

  • Yeah, it is relevant. The data was encoded using FlateDecode with a predictor. So if you need the original pixels for the Image object, you need to de-predict the data after you have used zlib.decompress. See the PNG spec for how to do that.

    PyPDF2 should probably also have some methods available to all this for you.