Source code for ractogateway.rag.page_index._ocr
"""OCR backends for PageIndexRAG.
Each backend converts raw image bytes (PNG/JPEG) into extracted text.
All backends follow the same interface so they are interchangeable.
Available backends
------------------
- :class:`TesseractOcrBackend` — free, offline, ``pytesseract`` wrapper
- :class:`EasyOcrBackend` — deep-learning, 80+ languages, offline
- :class:`GoogleVisionBackend` — Google Cloud Vision API
- :class:`GoogleDocumentAIBackend` — Google Document AI (tables, forms)
- :class:`AWSTextractBackend` — AWS Textract (forms, tables, key-value)
- :class:`AzureDocumentIntelligenceBackend` — Azure Form Recognizer v4
Quick start::

    from ractogateway.rag.page_index import PageIndexRAG
    from ractogateway.rag.page_index._ocr import TesseractOcrBackend

    rag = PageIndexRAG(llm_kit=kit, ocr_backend=TesseractOcrBackend())
    rag.ingest("scanned_report.pdf")  # OCR fallback auto-triggered
"""
from __future__ import annotations
import asyncio
from abc import ABC, abstractmethod
from typing import Any
# ---------------------------------------------------------------------------
# Base
# ---------------------------------------------------------------------------
[docs]
class BaseOcrBackend(ABC):
    """Abstract base class for OCR backends.

    Implementors must provide :meth:`extract_text`.
    An async default is provided via :meth:`aextract_text` that offloads
    the synchronous call to a thread-pool executor.
    """

    @abstractmethod
    def extract_text(self, image_bytes: bytes, mime_type: str = "image/png") -> str:
        """Convert *image_bytes* to plain text.

        Parameters
        ----------
        image_bytes:
            Raw image data (PNG, JPEG, TIFF, …).
        mime_type:
            MIME type hint used by cloud APIs (default ``"image/png"``).

        Returns
        -------
        str
            Extracted text, or an empty string if nothing was recognised.
        """

    async def aextract_text(
        self, image_bytes: bytes, mime_type: str = "image/png"
    ) -> str:
        """Async variant of :meth:`extract_text` (thread-pool offload).

        Parameters
        ----------
        image_bytes:
            Raw image data, forwarded unchanged to :meth:`extract_text`.
        mime_type:
            MIME type hint, forwarded unchanged to :meth:`extract_text`.

        Returns
        -------
        str
            Whatever :meth:`extract_text` returns for the same arguments.
        """
        # Inside a coroutine a loop is always running; get_running_loop()
        # is the non-deprecated way to obtain it (get_event_loop() may emit
        # a DeprecationWarning or create a new loop in other contexts).
        loop = asyncio.get_running_loop()
        # ``None`` selects the loop's default executor.
        return await loop.run_in_executor(None, self.extract_text, image_bytes, mime_type)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _require_pillow() -> Any:
try:
from PIL import Image
except ImportError as exc:
raise ImportError(
"pillow is required for image-based OCR. "
"Install it with: pip install ractogateway[rag-image]"
) from exc
return Image
def _bytes_to_pil(image_bytes: bytes) -> Any:
    """Open *image_bytes* with Pillow and return the resulting image object.

    The bytes are wrapped in an in-memory buffer and handed to
    ``Image.open``; Pillow availability is checked by ``_require_pillow``.
    """
    from io import BytesIO

    pil_image = _require_pillow()
    buffer = BytesIO(image_bytes)
    return pil_image.open(buffer)
# ---------------------------------------------------------------------------
# Tesseract (offline, free)
# ---------------------------------------------------------------------------
[docs]
class TesseractOcrBackend(BaseOcrBackend):
    """OCR via `Tesseract <https://github.com/tesseract-ocr/tesseract>`_.

    Requires ``pytesseract`` and a working Tesseract installation.
    Install with::

        pip install ractogateway[rag-ocr-tesseract]
        # Also install Tesseract binary: https://github.com/UB-Mannheim/tesseract/wiki

    Parameters
    ----------
    lang:
        Tesseract language string, e.g. ``"eng"`` (default), ``"eng+deu"``.
    config:
        Extra Tesseract config flags, e.g. ``"--psm 6"``.
    confidence_threshold:
        Mean word confidence (0–100) below which a page is to be treated
        as low quality. The value is only stored on the instance; the
        methods of this class do not apply it, so callers compare it with
        the confidence returned by :meth:`extract_with_confidence`.
    """

    def __init__(
        self,
        lang: str = "eng",
        config: str = "",
        confidence_threshold: float = 40.0,
    ) -> None:
        self._lang = lang
        self._config = config
        self._confidence_threshold = confidence_threshold

    @staticmethod
    def _import_pytesseract() -> Any:
        """Import ``pytesseract`` lazily, raising a helpful ImportError if absent."""
        try:
            import pytesseract
        except ImportError as exc:
            raise ImportError(
                "pytesseract is required for TesseractOcrBackend. "
                "Install it with: pip install ractogateway[rag-ocr-tesseract]"
            ) from exc
        return pytesseract

    def extract_text(self, image_bytes: bytes, mime_type: str = "image/png") -> str:
        """Run Tesseract on *image_bytes* and return the recognised text.

        *mime_type* is accepted for interface compatibility and is not used.

        Raises
        ------
        ImportError
            If ``pytesseract`` is not installed.
        """
        pytesseract = self._import_pytesseract()
        img = _bytes_to_pil(image_bytes)
        return pytesseract.image_to_string(img, lang=self._lang, config=self._config)

    def extract_with_confidence(self, image_bytes: bytes) -> tuple[str, float]:
        """Return ``(text, mean_confidence)`` for confidence-aware ingestion.

        Rows with a negative confidence are discarded before the mean is
        taken and before the text is joined. When no rows remain the
        result is ``("", 0.0)``.

        Raises
        ------
        ImportError
            If ``pytesseract`` or ``pandas`` is not installed.
        """
        pytesseract = self._import_pytesseract()
        # The DATAFRAME output type needs pandas; fail early with the same
        # kind of actionable message used for the other optional imports.
        try:
            import pandas  # noqa: F401, PLC0415 — availability check only
        except ImportError as exc:
            raise ImportError(
                "pandas is required for TesseractOcrBackend.extract_with_confidence. "
                "Install it with: pip install pandas"
            ) from exc

        img = _bytes_to_pil(image_bytes)
        data = pytesseract.image_to_data(
            img, lang=self._lang, config=self._config, output_type=pytesseract.Output.DATAFRAME
        )
        words = data[data["conf"] >= 0]
        if words.empty:
            return "", 0.0

        mean_conf: float = float(words["conf"].mean())
        tokens = words["text"].dropna().astype(str).str.strip()
        # Skip whitespace-only tokens so the joined text has no stray gaps.
        text = " ".join(tokens[tokens != ""])
        return text, mean_conf
# ---------------------------------------------------------------------------
# EasyOCR (offline, deep-learning)
# ---------------------------------------------------------------------------
[docs]
class EasyOcrBackend(BaseOcrBackend):
    """OCR via `EasyOCR <https://github.com/JaidedAI/EasyOCR>`_.

    Deep-learning model; no cloud API required. Supports 80+ languages.
    Install with::

        pip install ractogateway[rag-ocr-easy]

    Parameters
    ----------
    languages:
        List of language codes, e.g. ``["en"]`` (default) or ``["en", "de"]``.
    gpu:
        Use CUDA GPU if available (default ``False`` for broad compatibility).
    """

    def __init__(self, languages: list[str] | None = None, gpu: bool = False) -> None:
        self._languages = languages or ["en"]
        self._gpu = gpu
        self._reader: Any = None  # lazy init

    def _get_reader(self) -> Any:
        """Create the ``easyocr.Reader`` on first use and cache it.

        Raises
        ------
        ImportError
            If ``easyocr`` is not installed.
        """
        if self._reader is None:
            try:
                import easyocr
            except ImportError as exc:
                raise ImportError(
                    "easyocr is required for EasyOcrBackend. "
                    "Install it with: pip install ractogateway[rag-ocr-easy]"
                ) from exc
            self._reader = easyocr.Reader(self._languages, gpu=self._gpu)
        return self._reader

    def extract_text(self, image_bytes: bytes, mime_type: str = "image/png") -> str:
        """Recognise text in *image_bytes* and return it one fragment per line.

        *mime_type* is accepted for interface compatibility and is not used.

        Raises
        ------
        ImportError
            If ``easyocr`` is not installed.
        """
        reader = self._get_reader()
        # EasyOCR's readtext accepts a path/URL string, raw ``bytes`` or a
        # numpy array; a file-like object such as io.BytesIO is rejected
        # with "Invalid input type". bytes() also normalises bytearray or
        # memoryview input to the exact type EasyOCR checks for.
        results = reader.readtext(bytes(image_bytes), detail=0)
        return "\n".join(str(r) for r in results)
# ---------------------------------------------------------------------------
# Google Cloud Vision
# ---------------------------------------------------------------------------
[docs]
class GoogleVisionBackend(BaseOcrBackend):
    """OCR via Google Cloud Vision API (``DOCUMENT_TEXT_DETECTION``).

    Install with::

        pip install ractogateway[rag-ocr-google]

    Parameters
    ----------
    credentials_path:
        Path to a service-account JSON key file. If ``None`` the SDK uses
        Application Default Credentials (``GOOGLE_APPLICATION_CREDENTIALS``).
    """

    def __init__(self, credentials_path: str | None = None) -> None:
        self._credentials_path = credentials_path
        self._client: Any = None

    def _get_client(self) -> Any:
        """Return the cached ``ImageAnnotatorClient``, building it on first call.

        Raises
        ------
        ImportError
            If the Google Cloud Vision SDK cannot be imported.
        """
        if self._client is not None:
            return self._client

        try:
            from google.cloud import vision
            from google.oauth2 import service_account
        except ImportError as err:
            raise ImportError(
                "google-cloud-vision is required for GoogleVisionBackend. "
                "Install it with: pip install ractogateway[rag-ocr-google]"
            ) from err

        # Explicit credentials are only passed when a key file was given;
        # otherwise the client is constructed with no arguments.
        client_kwargs: dict[str, Any] = {}
        if self._credentials_path:
            client_kwargs["credentials"] = (
                service_account.Credentials.from_service_account_file(
                    self._credentials_path
                )
            )
        self._client = vision.ImageAnnotatorClient(**client_kwargs)
        return self._client

    def extract_text(self, image_bytes: bytes, mime_type: str = "image/png") -> str:
        """Send *image_bytes* to ``document_text_detection`` and return the text.

        *mime_type* is accepted for interface compatibility and is not used.

        Raises
        ------
        ImportError
            If the Google Cloud Vision SDK cannot be imported.
        RuntimeError
            If the response carries a non-empty error message.
        """
        try:
            from google.cloud import vision
        except ImportError as err:
            raise ImportError(
                "google-cloud-vision is required for GoogleVisionBackend. "
                "Install it with: pip install ractogateway[rag-ocr-google]"
            ) from err

        annotator = self._get_client()
        payload = vision.Image(content=image_bytes)
        reply = annotator.document_text_detection(image=payload)
        if reply.error.message:
            raise RuntimeError(f"Google Vision API error: {reply.error.message}")
        recognised = reply.full_text_annotation.text
        return recognised or ""
# ---------------------------------------------------------------------------
# Google Document AI
# ---------------------------------------------------------------------------
[docs]
class GoogleDocumentAIBackend(BaseOcrBackend):
    """OCR via `Google Document AI <https://cloud.google.com/document-ai>`_.

    Best for structured documents: tables, forms, invoices, contracts.
    Install with::

        pip install ractogateway[rag-ocr-google]

    Parameters
    ----------
    project_id:
        GCP project ID.
    processor_id:
        Document AI processor ID (e.g. an OCR or Form Parser processor).
    location:
        Processor region, usually ``"us"`` or ``"eu"`` (default ``"us"``).
    credentials_path:
        Optional path to a service-account JSON key.
    """

    def __init__(
        self,
        project_id: str,
        processor_id: str,
        location: str = "us",
        credentials_path: str | None = None,
    ) -> None:
        self._project_id = project_id
        self._processor_id = processor_id
        self._location = location
        self._credentials_path = credentials_path
        self._client: Any = None

    def _get_client(self) -> Any:
        """Return the cached ``DocumentProcessorServiceClient``, creating it once.

        The client is pointed at the regional endpoint derived from the
        configured location.

        Raises
        ------
        ImportError
            If the Document AI SDK cannot be imported.
        """
        if self._client is not None:
            return self._client

        try:
            from google.api_core.client_options import ClientOptions
            from google.cloud import documentai
            from google.oauth2 import service_account
        except ImportError as err:
            raise ImportError(
                "google-cloud-documentai is required for GoogleDocumentAIBackend. "
                "Install it with: pip install ractogateway[rag-ocr-google]"
            ) from err

        endpoint_options = ClientOptions(
            api_endpoint=f"{self._location}-documentai.googleapis.com"
        )
        client_kwargs: dict[str, Any] = {"client_options": endpoint_options}
        # Credentials are only supplied explicitly when a key file was given.
        if self._credentials_path:
            client_kwargs["credentials"] = (
                service_account.Credentials.from_service_account_file(
                    self._credentials_path
                )
            )
        self._client = documentai.DocumentProcessorServiceClient(**client_kwargs)
        return self._client

    def extract_text(self, image_bytes: bytes, mime_type: str = "image/png") -> str:
        """Process *image_bytes* with the configured processor and return its text.

        Parameters
        ----------
        image_bytes:
            Raw document/image content sent as a ``RawDocument``.
        mime_type:
            MIME type passed along with the raw document.

        Raises
        ------
        ImportError
            If the Document AI SDK cannot be imported.
        """
        try:
            from google.cloud import documentai
        except ImportError as err:
            raise ImportError(
                "google-cloud-documentai is required for GoogleDocumentAIBackend. "
                "Install it with: pip install ractogateway[rag-ocr-google]"
            ) from err

        processor_client = self._get_client()
        processor_name = (
            f"projects/{self._project_id}"
            f"/locations/{self._location}"
            f"/processors/{self._processor_id}"
        )
        document = documentai.RawDocument(content=image_bytes, mime_type=mime_type)
        process_request = documentai.ProcessRequest(
            name=processor_name, raw_document=document
        )
        outcome = processor_client.process_document(request=process_request)
        extracted = outcome.document.text
        return extracted or ""
# ---------------------------------------------------------------------------
# AWS Textract
# ---------------------------------------------------------------------------
[docs]
class AWSTextractBackend(BaseOcrBackend):
    """OCR via `AWS Textract <https://aws.amazon.com/textract/>`_.

    Best for forms and tables; uses ``DetectDocumentText`` for plain text.
    Install with::

        pip install ractogateway[rag-ocr-aws]

    Parameters
    ----------
    region_name:
        AWS region (default ``"us-east-1"``).
    aws_access_key_id / aws_secret_access_key:
        Optional explicit credentials; if omitted, boto3 uses the standard
        credential chain (env vars, ``~/.aws/credentials``, IAM role, etc.).
    """

    def __init__(
        self,
        region_name: str = "us-east-1",
        aws_access_key_id: str | None = None,
        aws_secret_access_key: str | None = None,
    ) -> None:
        self._region = region_name
        self._key_id = aws_access_key_id
        self._secret = aws_secret_access_key
        self._client: Any = None

    def _get_client(self) -> Any:
        """Return the cached boto3 Textract client, creating it on first call.

        Raises
        ------
        ImportError
            If ``boto3`` is not installed.
        """
        if self._client is not None:
            return self._client

        try:
            import boto3
        except ImportError as err:
            raise ImportError(
                "boto3 is required for AWSTextractBackend. "
                "Install it with: pip install ractogateway[rag-ocr-aws]"
            ) from err

        client_kwargs: dict[str, Any] = {"region_name": self._region}
        # Each credential is forwarded only when it was actually provided.
        optional_credentials = (
            ("aws_access_key_id", self._key_id),
            ("aws_secret_access_key", self._secret),
        )
        for option, value in optional_credentials:
            if value:
                client_kwargs[option] = value
        self._client = boto3.client("textract", **client_kwargs)
        return self._client

    def extract_text(self, image_bytes: bytes, mime_type: str = "image/png") -> str:
        """Call ``detect_document_text`` and join the ``LINE`` blocks with newlines.

        *mime_type* is accepted for interface compatibility and is not used.

        Raises
        ------
        ImportError
            If ``boto3`` is not installed.
        """
        textract = self._get_client()
        reply = textract.detect_document_text(Document={"Bytes": image_bytes})
        line_texts: list[str] = []
        for entry in reply.get("Blocks", []):
            # Only LINE blocks contribute to the returned text.
            if entry["BlockType"] != "LINE":
                continue
            line_texts.append(entry["Text"])
        return "\n".join(line_texts)
# ---------------------------------------------------------------------------
# Azure Document Intelligence (Form Recognizer v4)
# ---------------------------------------------------------------------------
[docs]
class AzureDocumentIntelligenceBackend(BaseOcrBackend):
    """OCR via `Azure Document Intelligence
    <https://learn.microsoft.com/azure/ai-services/document-intelligence/>`_.

    Previously called Azure Form Recognizer. The ``prebuilt-read`` model
    is used by default; swap in ``prebuilt-document`` for richer extraction.
    Install with::

        pip install ractogateway[rag-ocr-azure]

    Parameters
    ----------
    endpoint:
        Azure resource endpoint URL.
    api_key:
        Azure resource API key.
    model_id:
        Document Intelligence model to use (default ``"prebuilt-read"``).
    """

    def __init__(
        self,
        endpoint: str,
        api_key: str,
        model_id: str = "prebuilt-read",
    ) -> None:
        self._endpoint = endpoint
        self._api_key = api_key
        self._model_id = model_id
        self._client: Any = None

    def _get_client(self) -> Any:
        """Return the cached ``DocumentIntelligenceClient``, creating it once.

        Raises
        ------
        ImportError
            If the Azure Document Intelligence SDK cannot be imported.
        """
        if self._client is None:
            try:
                from azure.ai.documentintelligence import DocumentIntelligenceClient
                from azure.core.credentials import AzureKeyCredential
            except ImportError as exc:
                raise ImportError(
                    "azure-ai-documentintelligence is required for "
                    "AzureDocumentIntelligenceBackend. "
                    "Install it with: pip install ractogateway[rag-ocr-azure]"
                ) from exc
            self._client = DocumentIntelligenceClient(
                endpoint=self._endpoint,
                credential=AzureKeyCredential(self._api_key),
            )
        return self._client

    def extract_text(self, image_bytes: bytes, mime_type: str = "image/png") -> str:
        """Analyze *image_bytes* with the configured model and return its text.

        Paragraph contents are joined with newlines. If the result has no
        non-empty paragraphs, the result's top-level ``content`` string is
        returned instead (or ``""`` when that is missing too).
        *mime_type* is accepted for interface compatibility and is not used.

        Raises
        ------
        ImportError
            If the Azure Document Intelligence SDK cannot be imported.
        """
        try:
            from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
        except ImportError as exc:
            raise ImportError(
                "azure-ai-documentintelligence is required for "
                "AzureDocumentIntelligenceBackend. "
                "Install it with: pip install ractogateway[rag-ocr-azure]"
            ) from exc
        client = self._get_client()
        poller = client.begin_analyze_document(
            self._model_id,
            AnalyzeDocumentRequest(bytes_source=image_bytes),
        )
        # Blocks until the long-running analysis has finished.
        result = poller.result()
        paragraphs = [p.content for p in (result.paragraphs or []) if p.content]
        if paragraphs:
            return "\n".join(paragraphs)
        # Fall back to the full concatenated content rather than dropping
        # text when the chosen model returned no paragraph breakdown.
        return getattr(result, "content", None) or ""