Source code for ractogateway.rag.page_index._ocr

"""OCR backends for PageIndexRAG.

Each backend converts raw image bytes (PNG/JPEG) into extracted text.
All backends follow the same interface so they are interchangeable.

Available backends
------------------
- :class:`TesseractOcrBackend`   — free, offline, ``pytesseract`` wrapper
- :class:`EasyOcrBackend`        — deep-learning, 80+ languages, offline
- :class:`GoogleVisionBackend`   — Google Cloud Vision API
- :class:`GoogleDocumentAIBackend` — Google Document AI (tables, forms)
- :class:`AWSTextractBackend`    — AWS Textract (forms, tables, key-value)
- :class:`AzureDocumentIntelligenceBackend` — Azure Form Recognizer v4

Quick start::

    from ractogateway.rag.page_index import PageIndexRAG
    from ractogateway.rag.page_index._ocr import TesseractOcrBackend

    rag = PageIndexRAG(llm_kit=kit, ocr_backend=TesseractOcrBackend())
    rag.ingest("scanned_report.pdf")   # OCR fallback auto-triggered
"""

from __future__ import annotations

import asyncio
from abc import ABC, abstractmethod
from typing import Any


# ---------------------------------------------------------------------------
# Base
# ---------------------------------------------------------------------------


class BaseOcrBackend(ABC):
    """Abstract base class for OCR backends.

    Implementors must provide :meth:`extract_text`.  An async default is
    provided via :meth:`aextract_text` that offloads the synchronous call to
    a thread-pool executor.
    """

    @abstractmethod
    def extract_text(self, image_bytes: bytes, mime_type: str = "image/png") -> str:
        """Convert *image_bytes* to plain text.

        Parameters
        ----------
        image_bytes:
            Raw image data (PNG, JPEG, TIFF, …).
        mime_type:
            MIME type hint used by cloud APIs (default ``"image/png"``).

        Returns
        -------
        str
            Extracted text, or an empty string if nothing was recognised.
        """

    async def aextract_text(
        self, image_bytes: bytes, mime_type: str = "image/png"
    ) -> str:
        """Async variant of :meth:`extract_text` (thread-pool offload).

        Parameters
        ----------
        image_bytes:
            Raw image data, forwarded unchanged to :meth:`extract_text`.
        mime_type:
            MIME type hint, forwarded unchanged to :meth:`extract_text`.

        Returns
        -------
        str
            Whatever :meth:`extract_text` returns for the same arguments.
        """
        # A coroutine always executes inside a running loop, so ask for that
        # loop explicitly instead of going through the policy-dependent
        # ``get_event_loop()`` lookup.
        loop = asyncio.get_running_loop()
        # ``None`` selects the loop's default executor.
        return await loop.run_in_executor(None, self.extract_text, image_bytes, mime_type)
# --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _require_pillow() -> Any: try: from PIL import Image except ImportError as exc: raise ImportError( "pillow is required for image-based OCR. " "Install it with: pip install ractogateway[rag-image]" ) from exc return Image def _bytes_to_pil(image_bytes: bytes) -> Any: import io Image = _require_pillow() return Image.open(io.BytesIO(image_bytes)) # --------------------------------------------------------------------------- # Tesseract (offline, free) # ---------------------------------------------------------------------------
class TesseractOcrBackend(BaseOcrBackend):
    """OCR via `Tesseract <https://github.com/tesseract-ocr/tesseract>`_.

    Requires ``pytesseract`` and a working Tesseract installation.

    Install with::

        pip install ractogateway[rag-ocr-tesseract]
        # Also install Tesseract binary: https://github.com/UB-Mannheim/tesseract/wiki

    Parameters
    ----------
    lang:
        Tesseract language string, e.g. ``"eng"`` (default), ``"eng+deu"``.
    config:
        Extra Tesseract config flags, e.g. ``"--psm 6"``.
    confidence_threshold:
        Mean word confidence (0–100) below which a page is meant to be
        treated as low quality.  Set to ``0`` to disable filtering.  This
        class only stores the value; none of its methods read it.
        NOTE(review): presumably the ingestion code compares it with the
        confidence from :meth:`extract_with_confidence` — confirm.
    """

    def __init__(
        self,
        lang: str = "eng",
        config: str = "",
        confidence_threshold: float = 40.0,
    ) -> None:
        self._lang = lang
        self._config = config
        self._confidence_threshold = confidence_threshold

    @staticmethod
    def _import_pytesseract() -> Any:
        """Import ``pytesseract`` lazily, raising a helpful error if missing."""
        try:
            import pytesseract
        except ImportError as exc:
            raise ImportError(
                "pytesseract is required for TesseractOcrBackend. "
                "Install it with: pip install ractogateway[rag-ocr-tesseract]"
            ) from exc
        return pytesseract

    def extract_text(self, image_bytes: bytes, mime_type: str = "image/png") -> str:
        """Run Tesseract on *image_bytes* and return the recognised text.

        *mime_type* is accepted for interface compatibility and is not used.

        Raises
        ------
        ImportError
            If ``pytesseract`` is not installed.
        """
        pytesseract = self._import_pytesseract()
        img = _bytes_to_pil(image_bytes)
        return pytesseract.image_to_string(img, lang=self._lang, config=self._config)

    def extract_with_confidence(self, image_bytes: bytes) -> tuple[str, float]:
        """Return ``(text, mean_confidence)`` for confidence-aware ingestion.

        Rows with a negative confidence are discarded.  The mean is taken
        over the remaining rows (``0.0`` when there are none) and the text is
        the space-joined, stripped, non-blank tokens of those rows.

        Raises
        ------
        ImportError
            If ``pytesseract`` or ``pandas`` is not installed.
        """
        pytesseract = self._import_pytesseract()

        # Imported only to fail early when pandas is absent: the DATAFRAME
        # output type requested below is built on pandas.
        import pandas as pd  # noqa: F401, PLC0415 — lazy

        img = _bytes_to_pil(image_bytes)
        data = pytesseract.image_to_data(
            img, lang=self._lang, config=self._config, output_type=pytesseract.Output.DATAFRAME
        )
        words = data[data["conf"] >= 0]
        if words.empty:
            return "", 0.0

        mean_conf: float = float(words["conf"].mean())
        tokens = words["text"].dropna().astype(str).str.strip()
        # Whitespace-only tokens would otherwise show up as runs of spaces.
        text = " ".join(tokens[tokens != ""])
        return text, mean_conf
# --------------------------------------------------------------------------- # EasyOCR (offline, deep-learning) # ---------------------------------------------------------------------------
class EasyOcrBackend(BaseOcrBackend):
    """OCR via `EasyOCR <https://github.com/JaidedAI/EasyOCR>`_.

    Deep-learning model; no cloud API required.  Supports 80+ languages.

    Install with::

        pip install ractogateway[rag-ocr-easy]

    Parameters
    ----------
    languages:
        List of language codes, e.g. ``["en"]`` (default) or ``["en", "de"]``.
    gpu:
        Use CUDA GPU if available (default ``False`` for broad compatibility).
    """

    def __init__(self, languages: list[str] | None = None, gpu: bool = False) -> None:
        self._languages = languages or ["en"]
        self._gpu = gpu
        self._reader: Any = None  # lazy init

    def _get_reader(self) -> Any:
        """Create the ``easyocr.Reader`` on first use and cache it.

        Raises
        ------
        ImportError
            If ``easyocr`` is not installed.
        """
        if self._reader is None:
            try:
                import easyocr
            except ImportError as exc:
                raise ImportError(
                    "easyocr is required for EasyOcrBackend. "
                    "Install it with: pip install ractogateway[rag-ocr-easy]"
                ) from exc
            self._reader = easyocr.Reader(self._languages, gpu=self._gpu)
        return self._reader

    def extract_text(self, image_bytes: bytes, mime_type: str = "image/png") -> str:
        """Run EasyOCR on *image_bytes* and return one recognised fragment per line.

        *mime_type* is accepted for interface compatibility and is not used.

        Raises
        ------
        ImportError
            If ``easyocr`` is not installed.
        """
        reader = self._get_reader()
        # ``readtext`` takes the encoded image bytes directly (alongside file
        # paths, URLs and arrays); a file-like wrapper such as ``BytesIO`` is
        # not one of its supported input types.
        results = reader.readtext(image_bytes, detail=0)
        return "\n".join(str(r) for r in results)
# --------------------------------------------------------------------------- # Google Cloud Vision # ---------------------------------------------------------------------------
class GoogleVisionBackend(BaseOcrBackend):
    """OCR via Google Cloud Vision API (``DOCUMENT_TEXT_DETECTION``).

    Install with::

        pip install ractogateway[rag-ocr-google]

    Parameters
    ----------
    credentials_path:
        Path to a service-account JSON key file.  If ``None`` the SDK uses
        Application Default Credentials (``GOOGLE_APPLICATION_CREDENTIALS``).
    """

    def __init__(self, credentials_path: str | None = None) -> None:
        self._credentials_path = credentials_path
        self._client: Any = None

    def _get_client(self) -> Any:
        """Return the cached ``ImageAnnotatorClient``, creating it on first use.

        Raises
        ------
        ImportError
            If the Google Cloud Vision packages cannot be imported.
        """
        if self._client is not None:
            return self._client

        try:
            from google.cloud import vision
            from google.oauth2 import service_account
        except ImportError as exc:
            raise ImportError(
                "google-cloud-vision is required for GoogleVisionBackend. "
                "Install it with: pip install ractogateway[rag-ocr-google]"
            ) from exc

        # Explicit credentials are passed only when a key file was configured;
        # otherwise the client is built with no arguments.
        client_kwargs: dict[str, Any] = {}
        if self._credentials_path:
            client_kwargs["credentials"] = (
                service_account.Credentials.from_service_account_file(
                    self._credentials_path
                )
            )
        self._client = vision.ImageAnnotatorClient(**client_kwargs)
        return self._client

    def extract_text(self, image_bytes: bytes, mime_type: str = "image/png") -> str:
        """Send *image_bytes* to ``document_text_detection`` and return the text.

        *mime_type* is accepted for interface compatibility and is not used.

        Raises
        ------
        ImportError
            If ``google-cloud-vision`` is not installed.
        RuntimeError
            If the response carries a non-empty error message.
        """
        try:
            from google.cloud import vision
        except ImportError as exc:
            raise ImportError(
                "google-cloud-vision is required for GoogleVisionBackend. "
                "Install it with: pip install ractogateway[rag-ocr-google]"
            ) from exc

        annotator = self._get_client()
        payload = vision.Image(content=image_bytes)
        response = annotator.document_text_detection(image=payload)

        error_message = response.error.message
        if error_message:
            raise RuntimeError(f"Google Vision API error: {error_message}")
        return response.full_text_annotation.text or ""
# --------------------------------------------------------------------------- # Google Document AI # ---------------------------------------------------------------------------
class GoogleDocumentAIBackend(BaseOcrBackend):
    """OCR via `Google Document AI <https://cloud.google.com/document-ai>`_.

    Best for structured documents: tables, forms, invoices, contracts.

    Install with::

        pip install ractogateway[rag-ocr-google]

    Parameters
    ----------
    project_id:
        GCP project ID.
    processor_id:
        Document AI processor ID (e.g. an OCR or Form Parser processor).
    location:
        Processor region, usually ``"us"`` or ``"eu"`` (default ``"us"``).
    credentials_path:
        Optional path to a service-account JSON key.
    """

    def __init__(
        self,
        project_id: str,
        processor_id: str,
        location: str = "us",
        credentials_path: str | None = None,
    ) -> None:
        self._project_id = project_id
        self._processor_id = processor_id
        self._location = location
        self._credentials_path = credentials_path
        self._client: Any = None

    def _get_client(self) -> Any:
        """Return the cached ``DocumentProcessorServiceClient``, creating it lazily.

        The client is pointed at the regional endpoint derived from the
        configured location.

        Raises
        ------
        ImportError
            If the Google Document AI packages cannot be imported.
        """
        if self._client is not None:
            return self._client

        try:
            from google.api_core.client_options import ClientOptions
            from google.cloud import documentai
            from google.oauth2 import service_account
        except ImportError as exc:
            raise ImportError(
                "google-cloud-documentai is required for GoogleDocumentAIBackend. "
                "Install it with: pip install ractogateway[rag-ocr-google]"
            ) from exc

        endpoint = f"{self._location}-documentai.googleapis.com"
        client_kwargs: dict[str, Any] = {
            "client_options": ClientOptions(api_endpoint=endpoint)
        }
        # Explicit credentials are added only when a key file was configured.
        if self._credentials_path:
            client_kwargs["credentials"] = (
                service_account.Credentials.from_service_account_file(
                    self._credentials_path
                )
            )
        self._client = documentai.DocumentProcessorServiceClient(**client_kwargs)
        return self._client

    def extract_text(self, image_bytes: bytes, mime_type: str = "image/png") -> str:
        """Process *image_bytes* with the configured processor and return its text.

        Parameters
        ----------
        image_bytes:
            Raw document/image data sent as the request's raw document.
        mime_type:
            MIME type attached to the raw document (default ``"image/png"``).

        Raises
        ------
        ImportError
            If ``google-cloud-documentai`` is not installed.
        """
        try:
            from google.cloud import documentai
        except ImportError as exc:
            raise ImportError(
                "google-cloud-documentai is required for GoogleDocumentAIBackend. "
                "Install it with: pip install ractogateway[rag-ocr-google]"
            ) from exc

        processor_client = self._get_client()
        processor_name = f"projects/{self._project_id}/locations/{self._location}/processors/{self._processor_id}"
        request = documentai.ProcessRequest(
            name=processor_name,
            raw_document=documentai.RawDocument(content=image_bytes, mime_type=mime_type),
        )
        response = processor_client.process_document(request=request)
        return response.document.text or ""
# --------------------------------------------------------------------------- # AWS Textract # ---------------------------------------------------------------------------
class AWSTextractBackend(BaseOcrBackend):
    """OCR via `AWS Textract <https://aws.amazon.com/textract/>`_.

    Best for forms and tables; uses ``DetectDocumentText`` for plain text.

    Install with::

        pip install ractogateway[rag-ocr-aws]

    Parameters
    ----------
    region_name:
        AWS region (default ``"us-east-1"``).
    aws_access_key_id / aws_secret_access_key:
        Optional explicit credentials; if omitted, boto3 uses the standard
        credential chain (env vars, ``~/.aws/credentials``, IAM role, etc.).
    """

    def __init__(
        self,
        region_name: str = "us-east-1",
        aws_access_key_id: str | None = None,
        aws_secret_access_key: str | None = None,
    ) -> None:
        self._region = region_name
        self._key_id = aws_access_key_id
        self._secret = aws_secret_access_key
        self._client: Any = None

    def _get_client(self) -> Any:
        """Return the cached boto3 Textract client, creating it on first use.

        Raises
        ------
        ImportError
            If ``boto3`` is not installed.
        """
        if self._client is not None:
            return self._client

        try:
            import boto3
        except ImportError as exc:
            raise ImportError(
                "boto3 is required for AWSTextractBackend. "
                "Install it with: pip install ractogateway[rag-ocr-aws]"
            ) from exc

        client_kwargs: dict[str, Any] = {"region_name": self._region}
        # Each credential part is forwarded only when it was supplied.
        explicit_credentials = {
            "aws_access_key_id": self._key_id,
            "aws_secret_access_key": self._secret,
        }
        for option, value in explicit_credentials.items():
            if value:
                client_kwargs[option] = value

        self._client = boto3.client("textract", **client_kwargs)
        return self._client

    def extract_text(self, image_bytes: bytes, mime_type: str = "image/png") -> str:
        """Call ``detect_document_text`` and join the ``LINE`` blocks with newlines.

        *mime_type* is accepted for interface compatibility and is not used.

        Raises
        ------
        ImportError
            If ``boto3`` is not installed.
        """
        textract = self._get_client()
        response = textract.detect_document_text(Document={"Bytes": image_bytes})
        blocks = response.get("Blocks", [])
        return "\n".join(
            block["Text"] for block in blocks if block["BlockType"] == "LINE"
        )
# --------------------------------------------------------------------------- # Azure Document Intelligence (Form Recognizer v4) # ---------------------------------------------------------------------------
class AzureDocumentIntelligenceBackend(BaseOcrBackend):
    """OCR via `Azure Document Intelligence
    <https://learn.microsoft.com/azure/ai-services/document-intelligence/>`_.

    Previously called Azure Form Recognizer.  The ``prebuilt-read`` model is
    used by default; swap in ``prebuilt-document`` for richer extraction.

    Install with::

        pip install ractogateway[rag-ocr-azure]

    Parameters
    ----------
    endpoint:
        Azure resource endpoint URL.
    api_key:
        Azure resource API key.
    model_id:
        Document Intelligence model to use (default ``"prebuilt-read"``).
    """

    def __init__(
        self,
        endpoint: str,
        api_key: str,
        model_id: str = "prebuilt-read",
    ) -> None:
        self._endpoint = endpoint
        self._api_key = api_key
        self._model_id = model_id
        self._client: Any = None

    def _get_client(self) -> Any:
        """Return the cached ``DocumentIntelligenceClient``, creating it lazily.

        Raises
        ------
        ImportError
            If the Azure Document Intelligence packages cannot be imported.
        """
        if self._client is None:
            try:
                from azure.ai.documentintelligence import DocumentIntelligenceClient
                from azure.core.credentials import AzureKeyCredential
            except ImportError as exc:
                raise ImportError(
                    "azure-ai-documentintelligence is required for "
                    "AzureDocumentIntelligenceBackend. "
                    "Install it with: pip install ractogateway[rag-ocr-azure]"
                ) from exc
            self._client = DocumentIntelligenceClient(
                endpoint=self._endpoint,
                credential=AzureKeyCredential(self._api_key),
            )
        return self._client

    def extract_text(self, image_bytes: bytes, mime_type: str = "image/png") -> str:
        """Analyze *image_bytes* with the configured model and return its text.

        The non-empty paragraph contents are joined with newlines.  When the
        result has no paragraphs, the result's overall ``content`` string is
        returned instead so recognised text is not dropped.  *mime_type* is
        accepted for interface compatibility and is not used.

        Raises
        ------
        ImportError
            If ``azure-ai-documentintelligence`` is not installed.
        """
        try:
            from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
        except ImportError as exc:
            raise ImportError(
                "azure-ai-documentintelligence is required for "
                "AzureDocumentIntelligenceBackend. "
                "Install it with: pip install ractogateway[rag-ocr-azure]"
            ) from exc

        client = self._get_client()
        poller = client.begin_analyze_document(
            self._model_id,
            AnalyzeDocumentRequest(bytes_source=image_bytes),
        )
        # Blocks until the long-running analysis has finished.
        result = poller.result()

        paragraphs = [p.content for p in (result.paragraphs or []) if p.content]
        if paragraphs:
            return "\n".join(paragraphs)
        # No paragraph grouping in the result: fall back to the full text.
        return getattr(result, "content", None) or ""