79263414

Date: 2024-12-08 22:00:36
Score: 0.5
Natty:
Report link

Thanks a lot @tilman-hausherr and @mkl. I didn't think about filtering the fields and annotations. It took me some time, but I came up with the following version, which works for my test documents. Feel free to give some input/thoughts; hopefully other developers can benefit from it :)

How does it work:

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget;
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.apache.pdfbox.pdmodel.interactive.form.PDField;
import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

public class PdfCleanerUtils {

    private static final String EOF_MARKER = "%%EOF";

    private static final Integer EOF_LENGTH = EOF_MARKER.length();

    // Private constructor
    private PdfCleanerUtils() {
    }

    public static byte[] sanitizePdfDocument(byte[] documentData) throws ServerException {
        // Check if linearized
        boolean isLinearized = isLinearized(documentData);

        // Get the first EOF offset for non-linearized documents and the second EOF offset for linearized documents (Quite rare)
        int offset = getOffset(documentData, isLinearized ? 2 : 1);

        // Get the original byte range
        byte[] originalPdfData = new byte[offset + EOF_LENGTH];
        System.arraycopy(documentData, 0, originalPdfData, 0, offset + EOF_LENGTH);

        // Load and parse the PDF document based on the original data we just got
        try (PDDocument pdDocument = Loader.loadPDF(originalPdfData)) {
            // Remove encryption and security protection if required
            AccessPermission accessPermission = pdDocument.getCurrentAccessPermission();
            if (!accessPermission.canModify()) {
                pdDocument.setAllSecurityToBeRemoved(true);
            }

            // Remove certification if required
            COSDictionary catalog = pdDocument.getDocumentCatalog().getCOSObject();
            if (catalog.containsKey(COSName.PERMS)) {
                catalog.removeItem(COSName.PERMS);
            }

            // Check for a remaining signature. This can be the case when the first signature was added with incremental = false.
            // Signatures with incremental = true were already cut away by the EOF range because we drop the revisions
            int numberOfSignatures = getNumberOfSignatures(pdDocument);
            if (numberOfSignatures > 0) {
                // Ensure there is exactly one signature. Otherwise, our EOF marker search was wrong
                if (numberOfSignatures != 1) {
                    throw new ServerException("The original document has to contain exactly one signature because it was not incrementally signed. Signatures found: " + numberOfSignatures);
                }

                // Remove the remaining signature
                removeSignatureFromNonIncrementallySignedPdf(pdDocument);
            }

            // Re-check and ensure no signatures exist
            numberOfSignatures = getNumberOfSignatures(pdDocument);
            if (numberOfSignatures != 0) {
                throw new ServerException("The original document still contains signatures.");
            }

            // Ensure the document has at least one page
            if (pdDocument.getNumberOfPages() == 0) {
                throw new ServerException("The original document has no pages.");
            }

            // Write the original document loaded by pdfbox to filter out smaller issues
            try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
                pdDocument.save(byteArrayOutputStream);
                return byteArrayOutputStream.toByteArray();
            }
        } catch (IOException exception) {
            throw new ServerException("Unable to load the original PDF document: " + exception.getMessage(), exception);
        }
    }

    private static boolean isLinearized(byte[] originalPdfData) {
        // Parse the data and search for the linearized value in the first 1024 bytes
        String text = new String(originalPdfData, 0, 1024, StandardCharsets.UTF_8);
        return text.contains("/Linearized");
    }

    private static int getOffset(byte[] originalPdfData, int markerCount) {
        // Store the number of EOF markers we passed by
        int passedMarkers = 0;

        // Iterate over all bytes and find the n.th marker. Return this as offset
        for (int offset = 0; offset < originalPdfData.length - EOF_LENGTH; offset++) {
            // Sub-search for the EOF marker
            boolean found = true;
            for (int j = 0; j < EOF_LENGTH; j++) {
                if (originalPdfData[offset + j] != EOF_MARKER.charAt(j)) {
                    // Mismatching byte, set found to false and break
                    found = false;
                    break;
                }
            }

            // Check if the EOF marker was found
            if (found) {
                // Increase the passed markers
                passedMarkers++;

                // Check if we found our marker
                if (passedMarkers == markerCount) {
                    return offset;
                }
            }
        }

        // No EOF marker found - corrupted PDF document
        throw new RuntimeException("The PDF-document has no EOF marker - it looks corrupted.");
    }

    private static int getNumberOfSignatures(PDDocument pdDocument) {
        // Get the number of signatures
        PDAcroForm acroForm = pdDocument.getDocumentCatalog().getAcroForm();
        return acroForm != null ? pdDocument.getSignatureDictionaries().size() : 0;
    }

    private static void removeSignatureFromNonIncrementallySignedPdf(PDDocument pdDocument) throws IOException {
        // Get the AcroForm or return
        PDAcroForm acroForm = pdDocument.getDocumentCatalog().getAcroForm();
        if (acroForm == null) {
            return; // No AcroForm present
        }

        // Iterate over all fields in the AcroForm and filter out all signatures, but keep visual signature fields
        List<PDField> updatedFields = new ArrayList<>();
        for (PDField field : acroForm.getFields()) {
            // Handle signature fields or just re-add the other field
            if (field instanceof PDSignatureField signatureField) {
                // Get the dictionary and the first potential widget
                COSDictionary fieldDictionary = signatureField.getCOSObject();
                PDAnnotationWidget widget = signatureField.getWidgets().isEmpty() ? null : signatureField.getWidgets().getFirst();

                // Check for visibility. Only re-add signature fields and make them re-signable
                if (!isInvisible(widget)) {
                    // Clear the signature field and make it re-usable
                    fieldDictionary.removeItem(COSName.V);
                    fieldDictionary.removeItem(COSName.DV);
                    signatureField.setReadOnly(false);
                    updatedFields.add(signatureField);
                }
            } else {
                // Retain non-signature fields
                updatedFields.add(field);
            }
        }

        // Re-set the filtered AcroForm fields
        acroForm.setFields(updatedFields);

        // Iterate over all pages and their annotations and filter out all signature annotation
        for (PDPage page : pdDocument.getPages()) {
            // Filter the annotations for each page
            List<PDAnnotation> updatedAnnotations = new ArrayList<>();
            for (PDAnnotation annotation : page.getAnnotations()) {
                if (annotation instanceof PDAnnotationWidget widget) {
                    // Check if the widget belongs to an invisible signature
                    if (widget.getCOSObject().containsKey(COSName.PARENT)) {
                        COSDictionary parentField = widget.getCOSObject().getCOSDictionary(COSName.PARENT);
                        if (parentField != null && isInvisible(widget)) {
                            // Skip an invisible signature widget
                            continue;
                        }
                    }
                }
                updatedAnnotations.add(annotation); // Retain all other annotations
            }

            // Re-set the filtered annotations for the page
            page.setAnnotations(updatedAnnotations);
        }
    }

    private static boolean isInvisible(PDAnnotationWidget widget) {
        // A signature without an annotation widget is invisible
        if (widget == null) {
            return true;
        }

        // Check the rectangle for visibility. Null or width/height 0 means invisible
        PDRectangle pdRectangle = widget.getRectangle();
        return pdRectangle == null || pdRectangle.getWidth() == 0 && pdRectangle.getHeight() == 0;
    }
}
Reasons:
  • Blacklisted phrase (0.5): Thanks
  • Long answer (-1):
  • Has code block (-0.5):
  • User mentioned (1): @tilman-hausherr
  • User mentioned (0): @mkl
  • Self-answer (0.5):
Posted by: swaechter