Thanks a lot @tilman-hausherr and @mkl. I hadn't thought about filtering the fields and annotations. It took me some time, but I came up with the following version, which works for my test documents. Feel free to share any input or thoughts; hopefully other developers can benefit from it :)
How it works:
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget;
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.apache.pdfbox.pdmodel.interactive.form.PDField;
import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
/**
 * Static helpers that reduce a PDF to its first revision and strip any signature from it.
 *
 * <p>The first revision is located by scanning the raw bytes for {@code %%EOF} markers; everything
 * after the chosen marker is cut off. The truncated document is then parsed with PDFBox, cleaned
 * of signature-related entries and written out again.
 *
 * <p>NOTE(review): the marker scan is a plain byte search. A {@code %%EOF} byte sequence that
 * happens to occur inside binary content before the real marker would be picked up as well.
 */
public class PdfCleanerUtils {

    /** Marker that terminates each revision of a PDF file. */
    private static final String EOF_MARKER = "%%EOF";

    /** Length of {@link #EOF_MARKER}; a primitive so the scan loops do not unbox on every step. */
    private static final int EOF_LENGTH = EOF_MARKER.length();

    /** Maximum number of leading bytes inspected when looking for the {@code /Linearized} key. */
    private static final int LINEARIZED_PROBE_LENGTH = 1024;

    // Utility class - not instantiable
    private PdfCleanerUtils() {
    }

    /**
     * Cuts the given PDF down to its first revision, removes encryption restrictions, the
     * catalog's {@code /Perms} entry and any remaining signature, and re-saves it with PDFBox.
     *
     * @param documentData the raw bytes of the PDF document
     * @return the bytes of the re-saved, signature-free document
     * @throws ServerException if more than one signature remains after truncation, if a signature
     *                         is still present after removal, if the document has no pages, or if
     *                         PDFBox fails to load or save the document
     * @throws IllegalArgumentException if the required {@code %%EOF} marker cannot be found
     */
    public static byte[] sanitizePdfDocument(byte[] documentData) throws ServerException {
        // Non-linearized documents: first EOF marker. Linearized documents (quite rare): second one.
        boolean linearized = isLinearized(documentData);
        int offset = getOffset(documentData, linearized ? 2 : 1);

        // Copy everything up to and including the chosen EOF marker
        byte[] originalPdfData = new byte[offset + EOF_LENGTH];
        System.arraycopy(documentData, 0, originalPdfData, 0, offset + EOF_LENGTH);

        // Load and parse the PDF document based on the truncated data
        try (PDDocument pdDocument = Loader.loadPDF(originalPdfData)) {
            // Remove encryption and security protection if modification is not permitted
            AccessPermission accessPermission = pdDocument.getCurrentAccessPermission();
            if (!accessPermission.canModify()) {
                pdDocument.setAllSecurityToBeRemoved(true);
            }

            // Remove certification if present
            COSDictionary catalog = pdDocument.getDocumentCatalog().getCOSObject();
            if (catalog.containsKey(COSName.PERMS)) {
                catalog.removeItem(COSName.PERMS);
            }

            // A signature can remain when the first signature was not added incrementally.
            // Incrementally added signatures were already cut away together with their revisions.
            int numberOfSignatures = getNumberOfSignatures(pdDocument);
            if (numberOfSignatures > 0) {
                // More than one signature here means the EOF marker search picked the wrong revision
                if (numberOfSignatures != 1) {
                    throw new ServerException("The original document has to contain exactly one signature because it was not incrementally signed. Signatures found: " + numberOfSignatures);
                }
                removeSignatureFromNonIncrementallySignedPdf(pdDocument);
            }

            // Re-check and ensure no signatures exist
            numberOfSignatures = getNumberOfSignatures(pdDocument);
            if (numberOfSignatures != 0) {
                throw new ServerException("The original document still contains signatures.");
            }

            // Ensure the document has at least one page
            if (pdDocument.getNumberOfPages() == 0) {
                throw new ServerException("The original document has no pages.");
            }

            // Re-save through PDFBox so that the output is a freshly written document
            try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
                pdDocument.save(byteArrayOutputStream);
                return byteArrayOutputStream.toByteArray();
            }
        } catch (IOException exception) {
            throw new ServerException("Unable to load the original PDF document: " + exception.getMessage(), exception);
        }
    }

    /**
     * Checks whether the text {@code /Linearized} occurs within the first
     * {@value #LINEARIZED_PROBE_LENGTH} bytes of the data.
     *
     * @param originalPdfData the raw PDF bytes; may be shorter than the probe window
     * @return {@code true} if the key was found in the probe window
     */
    private static boolean isLinearized(byte[] originalPdfData) {
        // Clamp the window: a fixed length of 1024 would throw for shorter inputs
        int probeLength = Math.min(originalPdfData.length, LINEARIZED_PROBE_LENGTH);
        // ISO-8859-1 maps every byte to exactly one char, so binary bytes cannot disturb the search
        String header = new String(originalPdfData, 0, probeLength, StandardCharsets.ISO_8859_1);
        return header.contains("/Linearized");
    }

    /**
     * Finds the start offset of the n-th {@code %%EOF} marker in the data.
     *
     * @param originalPdfData the raw PDF bytes
     * @param markerCount     1-based index of the marker to locate
     * @return the offset of the first byte of the requested marker
     * @throws IllegalArgumentException if the data contains fewer than {@code markerCount} markers
     */
    private static int getOffset(byte[] originalPdfData, int markerCount) {
        int passedMarkers = 0;
        // Inclusive bound: a marker that occupies the very last bytes of the file must be found too
        int lastCandidate = originalPdfData.length - EOF_LENGTH;
        for (int offset = 0; offset <= lastCandidate; offset++) {
            if (isEofMarkerAt(originalPdfData, offset)) {
                passedMarkers++;
                if (passedMarkers == markerCount) {
                    return offset;
                }
            }
        }
        // Not enough EOF markers found - corrupted PDF document
        throw new IllegalArgumentException("The PDF-document has no EOF marker - it looks corrupted.");
    }

    /** Returns whether the bytes starting at {@code offset} spell out {@link #EOF_MARKER}. */
    private static boolean isEofMarkerAt(byte[] data, int offset) {
        for (int j = 0; j < EOF_LENGTH; j++) {
            if (data[offset + j] != EOF_MARKER.charAt(j)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Counts the signature dictionaries of the document.
     *
     * @param pdDocument the loaded document
     * @return the number of signature dictionaries, or 0 if the document has no AcroForm
     */
    private static int getNumberOfSignatures(PDDocument pdDocument) {
        PDAcroForm acroForm = pdDocument.getDocumentCatalog().getAcroForm();
        return acroForm != null ? pdDocument.getSignatureDictionaries().size() : 0;
    }

    /**
     * Removes the signature from the top-level AcroForm fields of the document.
     *
     * <p>Signature fields whose first widget is invisible are dropped from the AcroForm and all
     * of their widgets are removed from the pages. Visible signature fields are kept, but their
     * {@code /V} and {@code /DV} entries are removed and the read-only flag is cleared so the
     * field can be signed again. All other fields and annotations are retained.
     *
     * @param pdDocument the loaded document to modify in place
     * @throws IOException if PDFBox fails to read the annotations of a page
     */
    private static void removeSignatureFromNonIncrementallySignedPdf(PDDocument pdDocument) throws IOException {
        PDAcroForm acroForm = pdDocument.getDocumentCatalog().getAcroForm();
        if (acroForm == null) {
            return; // No AcroForm present
        }

        List<PDField> retainedFields = new ArrayList<>();
        // Widget dictionaries of the dropped signature fields. Collected up front because a widget
        // may be merged with its field dictionary (no /Parent entry), so a /Parent lookup on the
        // page annotations would miss it.
        List<COSDictionary> removedWidgetDictionaries = new ArrayList<>();

        for (PDField field : acroForm.getFields()) {
            if (field instanceof PDSignatureField signatureField) {
                List<PDAnnotationWidget> widgets = signatureField.getWidgets();
                PDAnnotationWidget firstWidget = widgets.isEmpty() ? null : widgets.getFirst();
                if (isInvisible(firstWidget)) {
                    // Drop the field and remember its widgets for the page clean-up below
                    for (PDAnnotationWidget widget : widgets) {
                        removedWidgetDictionaries.add(widget.getCOSObject());
                    }
                } else {
                    // Clear the signature field and make it re-usable
                    COSDictionary fieldDictionary = signatureField.getCOSObject();
                    fieldDictionary.removeItem(COSName.V);
                    fieldDictionary.removeItem(COSName.DV);
                    signatureField.setReadOnly(false);
                    retainedFields.add(signatureField);
                }
            } else {
                // Retain non-signature fields
                retainedFields.add(field);
            }
        }
        acroForm.setFields(retainedFields);

        if (removedWidgetDictionaries.isEmpty()) {
            return; // Nothing was dropped, the page annotations stay untouched
        }

        // Remove exactly the widgets of the dropped signature fields from every page
        for (PDPage page : pdDocument.getPages()) {
            List<PDAnnotation> annotations = page.getAnnotations();
            List<PDAnnotation> retainedAnnotations = new ArrayList<>();
            for (PDAnnotation annotation : annotations) {
                if (!containsSameInstance(removedWidgetDictionaries, annotation.getCOSObject())) {
                    retainedAnnotations.add(annotation);
                }
            }
            if (retainedAnnotations.size() != annotations.size()) {
                page.setAnnotations(retainedAnnotations);
            }
        }
    }

    /** Returns whether {@code candidate} is, by reference, one of the given dictionaries. */
    private static boolean containsSameInstance(List<COSDictionary> dictionaries, COSDictionary candidate) {
        for (COSDictionary dictionary : dictionaries) {
            if (dictionary == candidate) {
                return true;
            }
        }
        return false;
    }

    /**
     * Decides whether a signature widget is invisible.
     *
     * @param widget the widget to check, or {@code null} if the signature has no widget
     * @return {@code true} if the widget is {@code null}, has no rectangle, or has a rectangle
     *         whose width and height are both 0
     */
    private static boolean isInvisible(PDAnnotationWidget widget) {
        // A signature without an annotation widget is invisible
        if (widget == null) {
            return true;
        }
        PDRectangle pdRectangle = widget.getRectangle();
        return pdRectangle == null || (pdRectangle.getWidth() == 0 && pdRectangle.getHeight() == 0);
    }
}