If the problem is restated as finding the white input boxes in the PDF (credit to a comment from [K J](https://stackoverflow.com/users/10802527/k-j)), it can be solved fairly simply:
import fitz # PyMuPDF
import csv
# Path of the PDF that is opened and scanned for white-filled rectangles.
INPUT_PDF = "input.pdf"
# Path the document is saved to after the marker circles have been drawn.
OUTPUT_PDF = "output.pdf"
# Path of the CSV file that receives one row per matching rectangle.
OUTPUT_CSV = "output.csv"
def colour_match(color, target_color=(1, 1, 1), tol=0.0):
    """Return True if *color* matches *target_color*, else False.

    Args:
        color: Colour to test, normally a sequence of numeric components.
            ``None`` (or any non-sequence) never matches a sequence target.
        target_color: Colour to compare against. Defaults to ``(1, 1, 1)``.
        tol: Largest allowed absolute difference per component. The default
            of ``0.0`` keeps the comparison exact.

    Returns:
        True when both colours have the same number of components and every
        pair of components differs by at most *tol*; otherwise False.
    """
    # Fast path: covers everything a plain ``==`` comparison would accept.
    if color == target_color:
        return True
    if color is None or target_color is None:
        return False

    # Normalise to tuples so a list such as [1.0, 1.0, 1.0] can still match
    # the tuple (1, 1, 1); a bare ``==`` between list and tuple is False.
    try:
        components = tuple(color)
        wanted = tuple(target_color)
    except TypeError:
        return False  # not iterable, so it cannot be a colour sequence

    if len(components) != len(wanted):
        return False

    try:
        return all(abs(c - t) <= tol for c, t in zip(components, wanted))
    except TypeError:
        return False  # non-numeric components
# Mark every white-filled rectangle on the selected pages with a small blue
# circle at its lower-left corner, and log each rectangle's coordinates to CSV.
#
# The document is opened as a context manager so that it is closed even if
# reading a page or writing the CSV raises part-way through.
with fitz.open(INPUT_PDF) as doc:
    # Page numbers are zero based
    # pages_to_mark = list(range(len(doc)))  # Default filter for all pages
    pages_to_mark = [1]  # Example: only process page 2

    with open(OUTPUT_CSV, mode="w", newline="", encoding="utf-8") as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(["page_num", "x0", "y0", "x1", "y1"])

        for page_num in pages_to_mark:
            page = doc[page_num]
            drawings = page.get_drawings()
            # One shape per page collects all circles before they are
            # styled and committed together.
            shape = page.new_shape()

            for d in drawings:
                rect = d.get("rect")
                fill_color = d.get("fill")
                if rect and colour_match(fill_color, target_color=(1, 1, 1)):
                    x0, y0, x1, y1 = rect
                    cx, cy = x0, y1  # Lower-left corner for circle

                    # Draw circle on PDF page
                    shape.draw_circle((cx, cy), 2)  # Radius = 2 points

                    # Write full rect coords and page number to CSV
                    csvwriter.writerow([page_num, x0, y0, x1, y1])

            shape.finish(color=(0, 0, 1), fill=None)  # Blue stroke circle, no fill
            shape.commit()

    # Save only after the CSV has been flushed and closed, and while the
    # document is still open.
    doc.save(OUTPUT_PDF)
The following image demonstrates the solution: it shows the character boxes on page 2 that were not previously returned: