Search code examples
pdfbox

Cleaning up unused images in PDF page resources


Please forgive me if this has been asked before, but I have not found any matching questions yet.

I have some PDF files where images are duplicated on each page's resources but never used in its content stream. I think this is causing the PDFSplit command to create very bloated pages. Is there any utility code or examples to clean up unused resources like this? Maybe a starting point for me to get going?

Screenshot of the images


Solution

  • I was able to clean up the resources for each page by gathering a list of the images used inside the page's content stream. With the list of images, I then check the resources for the page and remove any that weren't used. See the PageExtractor.stripUnusedImages below for implementation details.

    The resource object was shared between pages so I also had to make sure each page had its own copy of the resource object before removing images. See PageExtractor.copyResources below for implementation details.

    The page splitter:

    package org.apache.pdfbox.examples;
    
    import org.apache.pdfbox.contentstream.operator.Operator;
    import org.apache.pdfbox.cos.COSBase;
    import org.apache.pdfbox.cos.COSDictionary;
    import org.apache.pdfbox.cos.COSName;
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.pdmodel.PDPage;
    import org.apache.pdfbox.pdmodel.PDResources;
    import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
    import org.apache.pdfbox.pdmodel.interactive.action.PDActionGoTo;
    import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
    import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
    import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
    import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    
    import java.io.IOException;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;
    
    
    /**
     * Extracts a single page from a PDF into a new document and removes image XObjects that are
     * listed in the page's resources but never drawn by its content.
     *
     * <p>Resource dictionaries may be shared between pages, so the page is given its own shallow
     * copy of the resource dictionary before anything is removed.
     */
    public class PageExtractor {

        private final Logger log = LoggerFactory.getLogger(this.getClass());

        /**
         * Creates a new one-page document containing the requested page of {@code source}.
         *
         * <p>The document version, document information and viewer preferences are carried over.
         * Unused images are stripped from the imported page and page references held by its
         * annotations are cleared.
         *
         * @param source     document to take the page from
         * @param pageNumber index handed to {@link PDDocument#getPage(int)}, which PDFBox documents
         *                   as zero-based; must not be {@code null}
         * @return the new document; the caller is responsible for closing it
         * @throws IOException if the page cannot be imported or its content cannot be parsed
         */
        public PDDocument extractPage(PDDocument source, Integer pageNumber) throws IOException {
            PDDocument targetPdf = new PDDocument();
            try {
                targetPdf.getDocument().setVersion(source.getVersion());
                targetPdf.setDocumentInformation(source.getDocumentInformation());
                targetPdf.getDocumentCatalog().setViewerPreferences(source.getDocumentCatalog().getViewerPreferences());

                PDPage sourcePage = source.getPage(pageNumber);
                PDPage targetPage = targetPdf.importPage(sourcePage);
                // NOTE(review): the imported page presumably still references objects owned by
                // the source document, so keep the source open until the result is saved - confirm.
                targetPage.setResources(sourcePage.getResources());

                stripUnusedImages(targetPage);
                stripPageLinks(targetPage);

                return targetPdf;
            } catch (IOException | RuntimeException e) {
                // The caller never receives the half-built document, so release it here.
                try {
                    targetPdf.close();
                } catch (IOException closeFailure) {
                    e.addSuppressed(closeFailure);
                }
                throw e;
            }
        }

        /**
         * Removes image XObjects from the page's resources that its content never draws.
         *
         * <ol>
         *   <li>Collect the names of the images drawn with the {@code Do} operator, including those
         *       drawn from nested form XObjects. Inline images are not looked at.</li>
         *   <li>Build a new XObject dictionary holding every non-image XObject plus the images
         *       whose names were collected.</li>
         *   <li>Install that dictionary in a page-private copy of the resources.</li>
         * </ol>
         *
         * <p>Names collected inside nested forms are matched against the page-level names as well,
         * so an unused page image that shares its name with an image drawn inside a form is kept.
         *
         * <p>A page without resources, or without an XObject dictionary, is left untouched.
         *
         * @param page the page to clean up
         * @throws IOException if the page's content stream cannot be parsed
         */
        protected void stripUnusedImages(PDPage page) throws IOException {
            PDResources resources = copyResources(page);
            COSBase xObjectEntry = resources.getCOSObject().getDictionaryObject(COSName.XOBJECT);
            if (!(xObjectEntry instanceof COSDictionary)) {
                // Missing or malformed /XObject entry: there are no images to strip.
                return;
            }
            COSDictionary pageObjects = (COSDictionary) xObjectEntry;
            // The resource copy is shallow, so pageObjects is still the dictionary other pages may
            // share. Fill a fresh dictionary instead of removing entries from it.
            COSDictionary newObjects = new COSDictionary();

            Set<String> imageNames = findImageNames(page);
            for (COSName xObjectName : resources.getXObjectNames()) {
                boolean unusedImage = resources.isImageXObject(xObjectName)
                        && !imageNames.contains(xObjectName.getName());
                if (unusedImage) {
                    log.info("Found unused image: name={}", xObjectName.getName());
                } else {
                    // getItem keeps the entry exactly as stored in the original dictionary.
                    newObjects.setItem(xObjectName, pageObjects.getItem(xObjectName));
                }
            }
            resources.getCOSObject().setItem(COSName.XOBJECT, newObjects);
            page.setResources(resources);
        }

        /**
         * Returns a shallow copy of the page's resources.
         *
         * <p>The copy is needed because the resource dictionary can be shared with other pages;
         * changes must stay scoped to the current page. Only the top-level dictionary is copied;
         * the dictionaries it refers to are still shared.
         *
         * @param page the page whose resources are copied
         * @return a new resources object, empty if the page has no resources
         */
        protected PDResources copyResources(PDPage page) {
            PDResources original = page.getResources();
            if (original == null) {
                return new PDResources();
            }
            return new PDResources(new COSDictionary(original.getCOSObject()));
        }

        /**
         * Collects the names of all image XObjects drawn while processing the page.
         *
         * @param page the page whose content is processed
         * @return names of the images reported by the stream engine
         * @throws IOException if the content stream cannot be parsed
         */
        protected Set<String> findImageNames(PDPage page) throws IOException {
            Set<String> imageNames = new HashSet<>();
            PdfImageStreamEngine engine = new PdfImageStreamEngine() {
                @Override
                void handleImage(Operator operator, List<COSBase> operands) {
                    // The engine only reports operands whose first element is a COSName.
                    COSName name = (COSName) operands.get(0);
                    imageNames.add(name.getName());
                }
            };
            engine.processPage(page);
            return imageNames;
        }

        /**
         * Clears the page references held by the page's annotations and link destinations.
         *
         * <p>Borrowed from the PDFBox page splitter.
         *
         * @param imported the page that was imported into the new document
         * @throws IOException if the annotations or a link destination cannot be read
         * @see org.apache.pdfbox.multipdf.Splitter#processAnnotations(org.apache.pdfbox.pdmodel.PDPage)
         */
        protected void stripPageLinks(PDPage imported) throws IOException {
            List<PDAnnotation> annotations = imported.getAnnotations();
            for (PDAnnotation annotation : annotations) {
                if (annotation instanceof PDAnnotationLink) {
                    PDAnnotationLink link = (PDAnnotationLink) annotation;
                    PDDestination destination = link.getDestination();
                    if (destination == null && link.getAction() != null) {
                        // No direct destination: fall back to the one of a GoTo action, if any.
                        PDAction action = link.getAction();
                        if (action instanceof PDActionGoTo) {
                            destination = ((PDActionGoTo) action).getDestination();
                        }
                    }
                    if (destination instanceof PDPageDestination) {
                        // TODO preserve links to pages within the split result
                        ((PDPageDestination) destination).setPage(null);
                    }
                }
                // TODO preserve links to pages within the split result
                annotation.setPage(null);
            }
        }

    }
    

    The stream reader used to analyze the page's images:

    package org.apache.pdfbox.examples;
    
    import org.apache.pdfbox.contentstream.PDFStreamEngine;
    import org.apache.pdfbox.contentstream.operator.Operator;
    import org.apache.pdfbox.contentstream.operator.OperatorProcessor;
    import org.apache.pdfbox.cos.COSBase;
    import org.apache.pdfbox.cos.COSName;
    import org.apache.pdfbox.pdmodel.graphics.PDXObject;
    import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
    import org.apache.pdfbox.pdmodel.graphics.form.PDTransparencyGroup;
    
    
    import java.io.IOException;
    import java.util.List;
    
    /**
     * Stream engine that reports every image XObject drawn with the {@code Do} operator.
     *
     * <p>Form XObjects and transparency groups drawn with {@code Do} are processed as well, so
     * images drawn inside them are also reported. Subclasses receive each image through
     * {@link #handleImage(Operator, List)}.
     */
    public abstract class PdfImageStreamEngine extends PDFStreamEngine {

        /** Deepest nesting of form XObjects that is followed; deeper forms are skipped. */
        private static final int MAX_FORM_DEPTH = 25;

        /** Number of nested form XObjects currently being processed. */
        private int formDepth;

        PdfImageStreamEngine() {
            addOperator(new DrawObjectCounter());
        }

        /**
         * Called once for every {@code Do} invocation whose first operand names an image XObject.
         *
         * @param operator the {@code Do} operator
         * @param operands the operator's operands; the first one is the image's {@link COSName}
         */
        abstract void handleImage(Operator operator, List<COSBase> operands);

        /** Handler for the {@code Do} operator that forwards image draws to the engine. */
        protected class DrawObjectCounter extends OperatorProcessor {
            @Override
            public void process(Operator operator, List<COSBase> operands) throws IOException {
                if (operands == null || operands.isEmpty()) {
                    // Malformed content stream: "Do" without an operand names nothing.
                    return;
                }
                if (isImage(operands.get(0))) {
                    handleImage(operator, operands);
                }
            }

            /**
             * Tells whether the operand names an image XObject in the current resources.
             *
             * <p>Side effect: when the operand names a form XObject or transparency group instead,
             * that form's content is processed so the images it draws are reported too. Forms
             * nested deeper than {@code MAX_FORM_DEPTH} are skipped, which keeps a form that draws
             * itself from recursing without end.
             *
             * @param base the first operand of the {@code Do} operator
             * @return {@code true} only if {@code base} is the name of an image XObject
             * @throws IOException if the XObject or a nested form cannot be read
             */
            protected Boolean isImage(COSBase base) throws IOException {
                if (!(base instanceof COSName)) {
                    return false;
                }
                COSName name = (COSName) base;
                if (context.getResources().isImageXObject(name)) {
                    return true;
                }
                PDXObject xObject = context.getResources().getXObject(name);
                if (xObject instanceof PDFormXObject) {
                    if (formDepth >= MAX_FORM_DEPTH) {
                        return false;
                    }
                    formDepth++;
                    try {
                        // Transparency groups are form XObjects too, so test for them first.
                        if (xObject instanceof PDTransparencyGroup) {
                            context.showTransparencyGroup((PDTransparencyGroup) xObject);
                        } else {
                            context.showForm((PDFormXObject) xObject);
                        }
                    } finally {
                        formDepth--;
                    }
                }
                return false;
            }

            @Override
            public String getName() {
                return "Do";
            }
        }

    }