# Source code for ractogateway.rag.page_index._models

"""Pydantic models for the PageIndexRAG pipeline."""

from __future__ import annotations

import uuid
from typing import Any

from pydantic import BaseModel, Field

from ractogateway.adapters.base import LLMResponse


def _new_id() -> str:
    """Return a newly generated random (version 4) UUID as a string."""
    fresh_uuid = uuid.uuid4()
    return str(fresh_uuid)



# [docs]  (documentation-link marker left over from the rendered source listing)
class PageEntry(BaseModel):
    """A single page (or fixed-size window) extracted from a document.

    Produced by :class:`~ractogateway.rag.page_index.pipeline.PageIndexRAG`
    during ingestion and stored in the in-process index.
    """

    # Identifier: filled in by ``_new_id`` whenever the caller omits it.
    entry_id: str = Field(description="Auto-generated UUID.", default_factory=_new_id)

    # Position inside the source document; optional.
    page_number: int | None = Field(
        None,
        description=(
            "1-based page number for page-aware sources "
            "(PDFs, Word docs). ``None`` for sliding-window entries "
            "created from plain-text files."
        ),
    )

    # Required text payload and where it came from.
    content: str = Field(description="Full text of the page (post-processing).")
    source: str = Field(description="Absolute file path or descriptive label.")

    # Optional descriptive metadata about the page.
    section_title: str | None = Field(
        None,
        description="First Markdown-style heading detected on the page, if any.",
    )
    keywords: list[str] = Field(
        description=(
            "Top-N TF-weighted content terms extracted from the page. Used by "
            "the decision index for first-stage candidate selection."
        ),
        default_factory=list,
    )

    # Required bookkeeping values; neither has a default.
    doc_id: str = Field(description="UUID of the parent document.")
    char_count: int = Field(description="Length of ``content`` in characters.")

    # Free-form metadata; each instance gets its own empty dict by default.
    extra: dict[str, Any] = Field(
        description="Caller-supplied metadata forwarded from ``ingest()`` kwargs.",
        default_factory=dict,
    )

    # OCR provenance: flag defaults to ``False``, confidence to ``None``.
    ocr_applied: bool = Field(
        False,
        description="``True`` when this page's text was produced by an OCR backend.",
    )
    ocr_confidence: float | None = Field(
        None,
        description=(
            "Mean OCR word confidence (0-100) reported by the backend, if available."
            "  ``None`` for backends that do not expose confidence."
        ),
    )

    # Optional digest; ``None`` when not supplied.
    content_hash: str | None = Field(
        None,
        description="SHA-256 hex digest of the raw page bytes used for deduplication.",
    )

    @property
    def text(self) -> str:
        """Return :attr:`content` under a second, read-only name."""
        page_text = self.content
        return page_text




# [docs]  (documentation-link marker left over from the rendered source listing)
class PageIndexResult(BaseModel):
    """A single retrieved page together with its BM25 relevance score."""

    # The page this result refers to (required, no default).
    entry: PageEntry

    # Ranking information; both values are required.
    score: float = Field(
        description="Okapi BM25 score (higher = more relevant).",
    )
    rank: int = Field(
        description="1-based rank within the current result list.",
    )

    # Defaults to a fresh empty list per instance.
    matched_terms: list[str] = Field(
        description="Query tokens that matched this page's content.",
        default_factory=list,
    )

    @property
    def content(self) -> str:
        """Return the ``content`` of the wrapped :attr:`entry`."""
        page = self.entry
        return page.content

    @property
    def text(self) -> str:
        """Return the wrapped entry's ``content`` (same value as :attr:`content`)."""
        page = self.entry
        return page.content




# [docs]  (documentation-link marker left over from the rendered source listing)
class PageIndexResponse(BaseModel):
    """Full response from :meth:`PageIndexRAG.query` / :meth:`PageIndexRAG.aquery`."""

    # Model-level settings for this class.
    model_config = {"arbitrary_types_allowed": True}

    # Optional LLM output; defaults to ``None``.
    answer: LLMResponse | None = Field(
        None,
        description=(
            "Generated answer from the LLM kit. ``None`` when no "
            "``llm_kit`` is configured on the pipeline."
        ),
    )

    # Defaults to a fresh empty list per instance.
    sources: list[PageIndexResult] = Field(
        description="Retrieved pages ranked by BM25 relevance.",
        default_factory=list,
    )

    # The only required field on the response.
    query: str = Field(description="The original question / query string.")

    # Empty string unless a context block is supplied.
    context_used: str = Field(
        "",
        description="Formatted context block that was supplied to the LLM.",
    )

    @property
    def results(self) -> list[PageIndexResult]:
        """Return :attr:`sources` itself (the same list object, not a copy)."""
        retrieved = self.sources
        return retrieved

    @property
    def pages(self) -> list[PageIndexResult]:
        """Return :attr:`sources` itself under the name ``pages``."""
        retrieved = self.sources
        return retrieved