I ended up trying the other solutions suggested in the comments, but I always found I was getting an accuracy of maybe 95%, which is not good enough for what I want to do.
I am now using EasyOCR, with a seemingly 100% pass rate.
from PyQt5.QtWidgets import QApplication, QMainWindow, QHBoxLayout, QWidget
from PyQt5.QtWebEngineWidgets import QWebEngineView, QWebEnginePage
from PyQt5.QtCore import QUrl, QTimer
import sys
import mss
from PIL import Image
from datetime import datetime
import easyocr
import numpy as np
class CustomWebEnginePage(QWebEnginePage):
    """QWebEnginePage subclass that silences JavaScript console output."""

    def javaScriptConsoleMessage(self, level, message, lineNumber, sourceID):
        """Intentionally do nothing so page console messages never reach
        the terminal. Signature matches the Qt override hook."""
class ScreenMonitorApp:
    """Qt application that OCRs a fixed screen region every 2 seconds and
    reloads two side-by-side web views whenever the recognized text changes.

    NOTE(review): the constructor starts the Qt event loop and calls
    sys.exit() when it ends, so __init__ never returns to the caller.
    """

    # Upscale factor applied before OCR to raise the effective DPI.
    OCR_SCALE = 3

    def __init__(self, monitor_index=3):
        """Build the window, initialize EasyOCR, and run the event loop.

        Args:
            monitor_index: index into QApplication.screens() on which to
                center the window; falls back to the primary monitor when
                out of range. Defaults to 3 to match the original script.
        """
        self.app = QApplication(sys.argv)
        self.window = QMainWindow()
        self.window.setGeometry(100, 100, 1400, 800)

        # Two equal-width web views side by side.
        central_widget = QWidget()
        layout = QHBoxLayout(central_widget)
        self.left_web = QWebEngineView()
        self.left_web.setPage(CustomWebEnginePage(self.left_web))
        self.right_web = QWebEngineView()
        self.right_web.setPage(CustomWebEnginePage(self.right_web))
        layout.addWidget(self.left_web, 1)
        layout.addWidget(self.right_web, 1)
        self.window.setCentralWidget(central_widget)

        self.previous_text = ""
        self.reader = easyocr.Reader(['en'])  # EasyOCR reader for English
        # Screen region (in pixels) that gets captured and OCR'd.
        self.region = {"top": 80, "left": 80, "width": 78, "height": 30}

        # Poll the region every 2 seconds.
        self.timer = QTimer()
        self.timer.timeout.connect(self.check_region)
        self.timer.start(2000)

        # Center the window on the requested monitor, if it exists.
        screens = self.app.screens()
        if monitor_index < len(screens):
            geometry = screens[monitor_index].geometry()
            x = geometry.x() + (geometry.width() - self.window.width()) // 2
            y = geometry.y() + (geometry.height() - self.window.height()) // 2
            self.window.move(x, y)
        else:
            print("Monitor index out of range. Opening on the primary monitor.")

        self.window.show()
        # Blocks here until the Qt event loop exits.
        sys.exit(self.app.exec_())

    def load_url(self, url_l, url_r):
        """Load url_l into the left view and url_r into the right view.

        Bug fix: the original ignored both parameters and always loaded a
        hard-coded placeholder ("https://example.com/") in both views.
        """
        print("URLs loaded")
        self.left_web.setUrl(QUrl(url_l))
        self.right_web.setUrl(QUrl(url_r))

    def perform_ocr(self):
        """Capture the screen region, upscale it, and run EasyOCR on it.

        Pipeline: grab region -> 3x Lanczos upscale (the original comment
        claimed 4x, but 234x90 is exactly 3x of the 78x30 region; the size
        is now derived from self.region so the two cannot drift apart) ->
        grayscale -> save a timestamped debug PNG -> EasyOCR.

        Returns:
            The first piece of text EasyOCR detects, or "" if none.
        """
        with mss.mss() as sct:
            img = sct.grab(self.region)
        pil_img = Image.frombytes("RGB", img.size, img.bgra, "raw", "BGRX")
        # Upscale with Lanczos resampling to increase the effective DPI.
        target = (self.region["width"] * self.OCR_SCALE,
                  self.region["height"] * self.OCR_SCALE)
        pil_resized = pil_img.resize(target, Image.LANCZOS)
        # Grayscale tends to help OCR on small text.
        pil_gray = pil_resized.convert('L')
        # Save the processed image with a timestamp for debugging.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        pil_gray.save(f"ocr_capture_{timestamp}.png", dpi=(300, 300))
        # EasyOCR takes a NumPy array; detail=0 returns text strings only
        # (no bounding boxes or confidences).
        img_np = np.array(pil_gray)
        result = self.reader.readtext(img_np, detail=0)
        return result[0] if result else ""

    def check_region(self):
        """Timer callback: re-OCR the region and reload both views when the
        text is non-empty and differs from the last reading."""
        current_text = self.perform_ocr()
        if current_text != self.previous_text and current_text:
            self.previous_text = current_text
            # Same text drives both panes; kept separate in case the two
            # views should diverge later.
            self.load_url(current_text, current_text)
            print(f"Updated search for: {current_text}")
if __name__ == "__main__":
    # The constructor runs the Qt event loop and exits the process when it
    # finishes, so the instance is never used afterwards.
    ScreenMonitorApp()