Cleaning up unused images in PDF page resources

Please forgive me if this has been asked but I have not found any matches yet.

I have some PDF files where images are duplicated on each page's resources but never used in its content stream. I think this is causing the PDFSplit command to create very bloated pages. Is there any utility code or examples to clean up unused resources like this? Maybe a starting point for me to get going?

Solution

I was able to clean up the resources for each page by gathering a list of the images used inside the page's content stream. With that list, I then check the page's resources and remove any images that weren't used. See PageExtractor.stripUnusedImages below for implementation details.

The resource object was shared between pages, so I also had to make sure each page had its own copy of the resource object before removing images. See PageExtractor.copyResources below for implementation details.

The page splitter:

package org.apache.pdfbox.examples;

import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionGoTo;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;


/**
 * Extracts a single page of a PDF into a new, stand-alone document and removes
 * image XObjects that are listed in the page's resources but never drawn by its
 * content stream.
 */
public class PageExtractor {

    private static final Logger log = LoggerFactory.getLogger(PageExtractor.class);

    /**
     * Builds a new one-page document from a page of {@code source}.
     *
     * <p>The version, document information and viewer preferences of the source
     * are carried over. Unused images are stripped from the imported page and its
     * annotations are detached from pages of the source document.
     *
     * @param source     the document to take the page from
     * @param pageNumber index of the page, passed straight to
     *                   {@link PDDocument#getPage(int)} (zero-based)
     * @return a new document that the caller is responsible for closing
     * @throws IOException if the page cannot be imported or its content stream
     *                     cannot be parsed
     */
    public PDDocument extractPage(PDDocument source, Integer pageNumber) throws IOException {
        PDDocument targetPdf = new PDDocument();
        try {
            targetPdf.getDocument().setVersion(source.getVersion());
            targetPdf.setDocumentInformation(source.getDocumentInformation());
            targetPdf.getDocumentCatalog().setViewerPreferences(source.getDocumentCatalog().getViewerPreferences());

            PDPage sourcePage = source.getPage(pageNumber);
            PDPage targetPage = targetPdf.importPage(sourcePage);
            // NOTE(review): presumably needed so resources inherited from the source
            // page tree end up on the imported page - confirm against importPage.
            targetPage.setResources(sourcePage.getResources());

            stripUnusedImages(targetPage);
            stripPageLinks(targetPage);

            return targetPdf;
        } catch (IOException | RuntimeException e) {
            // The caller never receives the document on failure, so release it here.
            try {
                targetPdf.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Removes image XObjects from the page's resources that its content stream
     * never draws.
     *
     * <ol>
     *   <li>Collect the names of the images drawn with the {@code Do} operator
     *       (see {@link #findImageNames(PDPage)}).</li>
     *   <li>Create an empty {@link COSDictionary}.</li>
     *   <li>Copy every non-image XObject, and every image XObject whose name was
     *       collected, into the new dictionary.</li>
     *   <li>Install the new dictionary as {@link COSName#XOBJECT} on a page-private
     *       copy of the resources.</li>
     * </ol>
     *
     * <p>A page without resources is left untouched.
     *
     * @param page the page to clean up
     * @throws IOException if the page's content stream cannot be processed
     */
    protected void stripUnusedImages(PDPage page) throws IOException {
        if (page.getResources() == null) {
            // Nothing to strip, and nothing to copy.
            return;
        }

        PDResources resources = copyResources(page);
        COSBase xObjectEntry = resources.getCOSObject().getDictionaryObject(COSName.XOBJECT);
        if (!(xObjectEntry instanceof COSDictionary)) {
            // No usable XObject dictionary: still give the page its own resource copy.
            page.setResources(resources);
            return;
        }
        COSDictionary pageObjects = (COSDictionary) xObjectEntry;

        // The copy made by copyResources is shallow, so pageObjects is still the
        // dictionary other pages may share. Build a replacement instead of
        // removing entries from it.
        COSDictionary newObjects = new COSDictionary();
        Set<String> imageNames = findImageNames(page);
        for (COSName xObjectName : resources.getXObjectNames()) {
            boolean unusedImage = resources.isImageXObject(xObjectName)
                    && !imageNames.contains(xObjectName.getName());
            if (unusedImage) {
                log.info("Found unused image: name={}", xObjectName.getName());
                continue;
            }
            // getItem (not getDictionaryObject) keeps indirect references intact.
            newObjects.setItem(xObjectName, pageObjects.getItem(xObjectName));
        }

        resources.getCOSObject().setItem(COSName.XOBJECT, newObjects);
        page.setResources(resources);
    }

    /**
     * Returns a shallow copy of the page's resources.
     *
     * <p>It is necessary to copy the page's resources since they can be shared with
     * other pages. We must ensure changes to the resources are scoped to the
     * current page. Only the top-level dictionary is duplicated; nested
     * dictionaries are still shared and must be replaced rather than mutated.
     *
     * @param page the page whose resources are copied
     * @return a new resources object; empty if the page has no resources
     */
    protected PDResources copyResources(PDPage page) {
        PDResources original = page.getResources();
        if (original == null) {
            return new PDResources();
        }
        return new PDResources(new COSDictionary(original.getCOSObject()));
    }

    /**
     * Collects the names of all image XObjects the page draws, including those
     * drawn from within form XObjects that {@link PdfImageStreamEngine} descends into.
     *
     * @param page the page whose content stream is scanned
     * @return the resource names of the drawn images, without the leading slash
     * @throws IOException if the content stream cannot be processed
     */
    protected Set<String> findImageNames(PDPage page) throws IOException {
        Set<String> imageNames = new HashSet<>();
        PdfImageStreamEngine engine = new PdfImageStreamEngine() {
            @Override
            void handleImage(Operator operator, List<COSBase> operands) {
                // The engine only reports operands whose first element is a COSName.
                COSName name = (COSName) operands.get(0);
                imageNames.add(name.getName());
            }
        };
        engine.processPage(page);
        return imageNames;
    }

    /**
     * Detaches the imported page's annotations from pages of the source document.
     *
     * <p>Borrowed from the PDFBox page splitter.
     *
     * @param imported the page that was imported into the target document
     * @throws IOException if the annotations or a link destination cannot be read
     * @see org.apache.pdfbox.multipdf.Splitter#processAnnotations(org.apache.pdfbox.pdmodel.PDPage)
     */
    protected void stripPageLinks(PDPage imported) throws IOException {
        List<PDAnnotation> annotations = imported.getAnnotations();
        for (PDAnnotation annotation : annotations) {
            if (annotation instanceof PDAnnotationLink) {
                PDAnnotationLink link = (PDAnnotationLink) annotation;
                PDDestination destination = link.getDestination();
                if (destination == null && link.getAction() != null) {
                    // No direct destination: fall back to the one on a GoTo action.
                    PDAction action = link.getAction();
                    if (action instanceof PDActionGoTo) {
                        destination = ((PDActionGoTo) action).getDestination();
                    }
                }
                if (destination instanceof PDPageDestination) {
                    // TODO preserve links to pages within the split result
                    ((PDPageDestination) destination).setPage(null);
                }
            }
            // TODO preserve links to pages within the split result
            annotation.setPage(null);
        }
    }

}

The stream reader used to analyze the page's images:

package org.apache.pdfbox.examples;

import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorProcessor;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDTransparencyGroup;


import java.io.IOException;
import java.util.List;

/**
 * A content stream engine that reports every image XObject drawn with the
 * {@code Do} operator. Form XObjects and transparency groups are processed
 * recursively, so images drawn inside them are reported as well.
 *
 * <p>Subclasses receive each hit through {@link #handleImage(Operator, List)}.
 */
public abstract class PdfImageStreamEngine extends PDFStreamEngine {

    PdfImageStreamEngine() {
        addOperator(new DrawObjectCounter());
    }

    /**
     * Called once for every {@code Do} invocation that draws an image XObject.
     *
     * @param operator the {@code Do} operator being processed
     * @param operands the operator's operands; the first one is the
     *                 {@link COSName} of the image in the current resources
     */
    abstract void handleImage(Operator operator, List<COSBase> operands);

    /**
     * Handler for the {@code Do} operator: forwards image XObjects to
     * {@link #handleImage(Operator, List)} and descends into form XObjects.
     */
    protected class DrawObjectCounter extends OperatorProcessor {
        @Override
        public void process(Operator operator, List<COSBase> operands) throws IOException {
            // A malformed stream may contain "Do" without an operand; skip it
            // instead of failing on operands.get(0).
            if (operands == null || operands.isEmpty()) {
                return;
            }
            if (isImage(operands.get(0))) {
                handleImage(operator, operands);
            }
        }

        /**
         * Tells whether the operand names an image XObject in the current resources.
         *
         * <p>As a side effect, when the operand names a form XObject or transparency
         * group, its content stream is processed so nested images are found.
         *
         * @param base the first operand of the {@code Do} operator
         * @return {@code true} only if {@code base} is a name that resolves to an
         *         image XObject
         * @throws IOException if the XObject cannot be loaded or a nested content
         *                     stream cannot be processed
         */
        protected Boolean isImage(COSBase base) throws IOException {
            if (!(base instanceof COSName)) {
                return false;
            }
            COSName name = (COSName) base;
            PDResources resources = context.getResources();
            if (resources.isImageXObject(name)) {
                return true;
            }
            // Not an image: descend into forms so that images they draw are reported.
            PDXObject xObject = resources.getXObject(name);
            if (xObject instanceof PDTransparencyGroup) {
                // Checked first because it is the more specific form type.
                context.showTransparencyGroup((PDTransparencyGroup) xObject);
            } else if (xObject instanceof PDFormXObject) {
                context.showForm((PDFormXObject) xObject);
            }
            return false;
        }

        @Override
        public String getName() {
            return "Do";
        }
    }

}