清理PDF页面资源中未使用的图像

时间:2018-08-01 22:12:52

标签: pdfbox

如果有人问过这个问题,请原谅我,但我还没有找到任何匹配项。

我有一些PDF文件,其中的图像在每个页面的资源(Resources)中重复出现,但从未在页面的内容流中被使用。我认为这导致PDFSplit命令生成了非常臃肿的页面。是否有任何实用程序代码或示例可以清除此类未使用的资源?或者至少能给我一个入手的起点?

Screenshot of the images

1 个答案:

答案 0 :(得分:5)

我通过收集页面内容流中实际使用的图像名称列表来清理每个页面的资源:用该列表逐一检查页面资源中的图像,并删除所有未被使用的图像。有关实现的详细信息,请参见下面的PageExtractor.stripUnusedImages。

资源对象在页面之间共享,因此在删除图像之前,我还必须确保每个页面都有自己的资源对象副本。有关实现的详细信息,请参见下面的PageExtractor.copyResources。

分页器:

package org.apache.pdfbox.examples;

import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionGoTo;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;


/**
 * Extracts a single page of a PDF into a new document and drops the image XObjects that the
 * page's resources list but its content stream never draws.
 */
public class PageExtractor {

    // Instance logger on purpose: subclasses log under their own class name.
    private final Logger log = LoggerFactory.getLogger(this.getClass());

    /**
     * Builds a new one-page document from a page of {@code source}.
     *
     * <p>The version, document information and viewer preferences of {@code source} are carried
     * over. Unused images are stripped from the imported page and its link/annotation page
     * references are cleared.
     *
     * @param source     document to read the page from; it is not closed by this method
     * @param pageNumber page index handed to {@link PDDocument#getPage(int)} (zero-based there)
     * @return the new document; the caller is responsible for closing it
     * @throws IOException if importing or analysing the page fails; the partially built target
     *                     document is closed before the exception propagates
     */
    public PDDocument extractPage(PDDocument source, Integer pageNumber) throws IOException {
        PDDocument targetPdf = new PDDocument();
        try {
            targetPdf.getDocument().setVersion(source.getVersion());
            targetPdf.setDocumentInformation(source.getDocumentInformation());
            targetPdf.getDocumentCatalog().setViewerPreferences(source.getDocumentCatalog().getViewerPreferences());

            PDPage sourcePage = source.getPage(pageNumber);
            PDPage targetPage = targetPdf.importPage(sourcePage);
            targetPage.setResources(sourcePage.getResources());

            stripUnusedImages(targetPage);
            stripPageLinks(targetPage);

            return targetPdf;
        } catch (IOException | RuntimeException e) {
            // The caller never receives targetPdf on failure, so it must be released here.
            closeAfterFailure(targetPdf, e);
            throw e;
        }
    }

    /** Closes {@code document}; a failure to close is attached to {@code cause} as suppressed. */
    private static void closeAfterFailure(PDDocument document, Exception cause) {
        try {
            document.close();
        } catch (IOException closeFailure) {
            cause.addSuppressed(closeFailure);
        }
    }

    /**
     * Rebuilds the page's XObject dictionary without the images that are never drawn.
     *
     * <ol>
     *   <li>Collect the image names referenced by {@code Do} operators via {@link #findImageNames}.</li>
     *   <li>Create an empty COSDictionary.</li>
     *   <li>Copy every non-image XObject, and every image XObject whose name was collected,
     *       into the new dictionary.</li>
     *   <li>Install the new dictionary as {@code COSName.XOBJECT} on a page-private copy of the
     *       resources and assign that copy to the page.</li>
     * </ol>
     *
     * @param page page whose resources are cleaned; it receives its own resources object
     * @throws IOException if the content stream or an XObject cannot be read
     */
    protected void stripUnusedImages(PDPage page) throws IOException {
        PDResources resources = copyResources(page);
        COSDictionary pageObjects = (COSDictionary) resources.getCOSObject().getDictionaryObject(COSName.XOBJECT);
        COSDictionary newObjects = new COSDictionary();

        Set<String> imageNames = findImageNames(page);
        // When there is no XObject dictionary the name list is empty, so pageObjects is never
        // dereferenced while null.
        for (COSName xObjectName : resources.getXObjectNames()) {
            boolean unusedImage = resources.isImageXObject(xObjectName)
                    && !imageNames.contains(xObjectName.getName());
            if (unusedImage) {
                log.info("Found unused image: name={}", xObjectName.getName());
            } else {
                // getItem (not getDictionaryObject) keeps the entry exactly as stored.
                newObjects.setItem(xObjectName, pageObjects.getItem(xObjectName));
            }
        }
        resources.getCOSObject().setItem(COSName.XOBJECT, newObjects);
        page.setResources(resources);
    }

    /**
     * Returns a resources object that belongs to this page only.
     *
     * <p>It is necessary to copy the page's resources since they can be shared with other pages;
     * changes must stay scoped to the current page. Only the top-level dictionary is copied.
     *
     * @param page page whose resources are copied
     * @return a copy of the page's resources, or empty resources when the page has none
     */
    protected PDResources copyResources(PDPage page) {
        PDResources shared = page.getResources();
        if (shared == null) {
            // A page without a resources dictionary has nothing to copy.
            return new PDResources();
        }
        return new PDResources(new COSDictionary(shared.getCOSObject()));
    }

    /**
     * Runs the page through a {@code PdfImageStreamEngine} and gathers the names it reports.
     *
     * @param page page to analyse
     * @return names of the image XObjects reported by the engine while processing the page
     * @throws IOException if the page's content cannot be processed
     */
    protected Set<String> findImageNames(PDPage page) throws IOException {
        final Set<String> imageNames = new HashSet<>();
        PdfImageStreamEngine engine = new PdfImageStreamEngine() {
            @Override
            void handleImage(Operator operator, List<COSBase> operands) {
                // The engine only reports operands whose first element is a COSName.
                COSName name = (COSName) operands.get(0);
                imageNames.add(name.getName());
            }
        };
        engine.processPage(page);
        return imageNames;
    }

    /**
     * Clears page references held by the page's annotations and link destinations.
     *
     * <p>Borrowed from PDFBox page splitter.
     *
     * @param imported page whose annotations are detached from their original pages
     * @throws IOException if a link destination cannot be read
     * @see org.apache.pdfbox.multipdf.Splitter#processAnnotations(org.apache.pdfbox.pdmodel.PDPage)
     */
    protected void stripPageLinks(PDPage imported) throws IOException {
        List<PDAnnotation> annotations = imported.getAnnotations();
        for (PDAnnotation annotation : annotations) {
            if (annotation instanceof PDAnnotationLink) {
                PDAnnotationLink link = (PDAnnotationLink) annotation;
                PDDestination destination = link.getDestination();
                if (destination == null) {
                    // Fall back to the destination of a GoTo action, if the link has one.
                    PDAction action = link.getAction();
                    if (action instanceof PDActionGoTo) {
                        destination = ((PDActionGoTo) action).getDestination();
                    }
                }
                if (destination instanceof PDPageDestination) {
                    // TODO preserve links to pages within the splitted result
                    ((PDPageDestination) destination).setPage(null);
                }
            }
            // TODO preserve links to pages within the splitted result
            annotation.setPage(null);
        }
    }

}

用于分析页面图像的流阅读器:

package org.apache.pdfbox.examples;

import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorProcessor;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDTransparencyGroup;


import java.io.IOException;
import java.util.List;

/**
 * Stream engine that registers a single {@code Do} operator processor and reports each
 * {@code Do} whose operand names an image XObject to {@link #handleImage}.
 */
public abstract class PdfImageStreamEngine extends PDFStreamEngine {

    PdfImageStreamEngine() {
        addOperator(new DrawObjectCounter());
    }

    /**
     * Called for every {@code Do} operator whose first operand names an image XObject.
     *
     * @param operator the {@code Do} operator being processed
     * @param operands the operator's operands; the first one is a {@code COSName}
     */
    abstract void handleImage(Operator operator, List<COSBase> operands);

    /** Processor for the {@code Do} operator. */
    protected class DrawObjectCounter extends OperatorProcessor {
        /**
         * Reports the operator to {@link #handleImage} when its first operand is an image name.
         * A {@code Do} without operands (malformed content stream) is ignored.
         */
        @Override
        public void process(Operator operator, List<COSBase> operands) throws IOException {
            if (operands == null || operands.isEmpty()) {
                // Nothing names an XObject; skipping avoids an IndexOutOfBoundsException.
                return;
            }
            if (isImage(operands.get(0))) {
                handleImage(operator, operands);
            }
        }

        /**
         * Tells whether {@code base} names an image XObject in the current resources.
         *
         * <p>Side effect: when the name resolves to a transparency group or form XObject, that
         * object is handed to the engine ({@code showTransparencyGroup} / {@code showForm})
         * before {@code false} is returned.
         * NOTE(review): no recursion guard is visible here; confirm the engine protects against
         * self-referencing forms.
         *
         * @param base first operand of the {@code Do} operator
         * @return {@code true} only when {@code base} is a name of an image XObject
         * @throws IOException if the XObject cannot be read or the nested stream fails to process
         */
        protected Boolean isImage(COSBase base) throws IOException {
            if (!(base instanceof COSName)) {
                return false;
            }
            COSName name = (COSName) base;
            if (context.getResources().isImageXObject(name)) {
                return true;
            }
            PDXObject xObject = context.getResources().getXObject(name);
            // Test the subclass first: a transparency group is also a form XObject.
            if (xObject instanceof PDTransparencyGroup) {
                context.showTransparencyGroup((PDTransparencyGroup) xObject);
            } else if (xObject instanceof PDFormXObject) {
                context.showForm((PDFormXObject) xObject);
            }
            return false;
        }

        @Override
        public String getName() {
            return "Do";
        }
    }

}