from PIL import Image
import pytesseract
import zipfile
import os
# Path to the uploaded DOCX file. A .docx is a ZIP archive; pictures embedded
# in the document are stored as entries under "word/media/".
docx_path = "/mnt/data/tbk.docx"

# Extract the embedded media files from the DOCX archive.
image_paths = []
with zipfile.ZipFile(docx_path, 'r') as docx:
    # List every file entry under word/media/. Entries ending in "/" are
    # directory records: they have an empty basename and cannot be written
    # out as files, so they are filtered out here.
    image_files = [
        item
        for item in docx.namelist()
        if item.startswith("word/media/") and not item.endswith("/")
    ]

    # Write each media file next to the upload in /mnt/data/, keeping only
    # the entry's base name.
    for image_file in image_files:
        image_data = docx.read(image_file)
        image_path = f"/mnt/data/{os.path.basename(image_file)}"
        with open(image_path, "wb") as img:
            img.write(image_data)
        image_paths.append(image_path)

# Perform OCR on all extracted images, mapping each file path to its text.
ocr_results = {}
for path in image_paths:
    try:
        image = Image.open(path)
    except OSError as exc:
        # Pillow raises UnidentifiedImageError (an OSError subclass) for
        # media it cannot decode. Skip that file instead of aborting the run
        # and losing the results already collected.
        print(f"Skipping {path}: {exc}")
        continue
    # The context manager closes the underlying file handle once OCR is done.
    with image:
        ocr_results[path] = pytesseract.image_to_string(image)

# Bare expression so a notebook / interactive cell displays the results.
ocr_results