# Reports

from PIL import Image
import pytesseract
import zipfile
import os

# Path to the uploaded DOCX file. It is opened below as a ZIP archive, so the
# standard zipfile module can read its members directly.
docx_path = "/mnt/data/tbk.docx"

# Extract the embedded media files from the DOCX archive.
with zipfile.ZipFile(docx_path, 'r') as docx:
    # Collect every member stored under word/media/. Names ending in "/" are
    # directory entries, not files: their basename is empty, so writing them
    # out would try to open "/mnt/data/" itself and fail.
    image_files = [
        item
        for item in docx.namelist()
        if item.startswith("word/media/") and not item.endswith("/")
    ]
    # Write each member into /mnt/data under its base file name (the
    # word/media/ prefix is dropped) and remember the resulting paths.
    image_paths = []
    for image_file in image_files:
        image_data = docx.read(image_file)
        image_path = f"/mnt/data/{os.path.basename(image_file)}"
        with open(image_path, "wb") as img:
            img.write(image_data)
        image_paths.append(image_path)

# Perform OCR on all extracted images, mapping each image path to the text
# that pytesseract recognized in it.
ocr_results = {}
for path in image_paths:
    # Image.open keeps the underlying file open; the context manager closes
    # it as soon as OCR for this image is done instead of leaking one file
    # handle per image.
    with Image.open(path) as image:
        text = pytesseract.image_to_string(image)
    ocr_results[path] = text

# Bare expression: presumably the last line of a notebook/REPL cell, where it
# displays the results. It has no effect when run as a plain script.
ocr_results
79598236