"""Pydantic models for the PageIndexRAG pipeline."""
from __future__ import annotations
import uuid
from typing import Any
from pydantic import BaseModel, Field
from ractogateway.adapters.base import LLMResponse
def _new_id() -> str:
    """Return a freshly generated random (version 4) UUID in string form."""
    generated = uuid.uuid4()
    return str(generated)
[docs]
class PageEntry(BaseModel):
    """One indexed unit of document text: a page or a fixed-size window.

    Instances are created by
    :class:`~ractogateway.rag.page_index.pipeline.PageIndexRAG` while a
    document is being ingested, and are kept in the in-process index.
    """

    # --- identity and position --------------------------------------------
    entry_id: str = Field(description="Auto-generated UUID.", default_factory=_new_id)
    page_number: int | None = Field(
        None,
        description=(
            "1-based page number for page-aware sources (PDFs, Word docs). ``None`` "
            "for sliding-window entries created from plain-text files."
        ),
    )

    # --- text and where it came from --------------------------------------
    content: str = Field(
        description="Full text of the page (post-processing).",
    )
    source: str = Field(
        description="Absolute file path or descriptive label.",
    )
    section_title: str | None = Field(
        None,
        description="First Markdown-style heading detected on the page, if any.",
    )
    keywords: list[str] = Field(
        description=(
            "Top-N TF-weighted content terms extracted from the page. Used by the "
            "decision index for first-stage candidate selection."
        ),
        default_factory=list,
    )

    # --- bookkeeping ------------------------------------------------------
    doc_id: str = Field(
        description="UUID of the parent document.",
    )
    char_count: int = Field(
        description="Length of ``content`` in characters.",
    )
    extra: dict[str, Any] = Field(
        description="Caller-supplied metadata forwarded from ``ingest()`` kwargs.",
        default_factory=dict,
    )

    # --- OCR provenance and deduplication ---------------------------------
    ocr_applied: bool = Field(
        False,
        description="``True`` when this page's text was produced by an OCR backend.",
    )
    ocr_confidence: float | None = Field(
        None,
        description=(
            "Mean OCR word confidence (0-100) reported by the backend, if available. "
            "``None`` for backends that do not expose confidence."
        ),
    )
    content_hash: str | None = Field(
        None,
        description="SHA-256 hex digest of the raw page bytes used for deduplication.",
    )

    @property
    def text(self) -> str:
        """Read-only alias that returns the ``content`` field unchanged."""
        return self.content
[docs]
class PageIndexResult(BaseModel):
    """Pairs one retrieved :class:`PageEntry` with its BM25 relevance data."""

    # The page that was retrieved.
    entry: PageEntry

    # Scoring information attached to the page for this particular query.
    score: float = Field(
        description="Okapi BM25 score (higher = more relevant).",
    )
    rank: int = Field(
        description="1-based rank within the current result list.",
    )
    matched_terms: list[str] = Field(
        description="Query tokens that matched this page's content.",
        default_factory=list,
    )

    @property
    def content(self) -> str:
        """Shortcut for the wrapped entry's ``content``."""
        page = self.entry
        return page.content

    @property
    def text(self) -> str:
        """Shortcut for the wrapped entry's ``content`` (same value as ``content``)."""
        page = self.entry
        return page.content
[docs]
class PageIndexResponse(BaseModel):
    """Complete result of :meth:`PageIndexRAG.query` / :meth:`PageIndexRAG.aquery`.

    Bundles the optional LLM answer with the retrieved pages, the original
    query string and the context block that was handed to the LLM.
    """

    # Permit field types that Pydantic cannot validate natively.
    model_config = {"arbitrary_types_allowed": True}

    answer: LLMResponse | None = Field(
        None,
        description=(
            "Generated answer from the LLM kit. ``None`` when no ``llm_kit`` is "
            "configured on the pipeline."
        ),
    )
    sources: list[PageIndexResult] = Field(
        description="Retrieved pages ranked by BM25 relevance.",
        default_factory=list,
    )
    query: str = Field(
        description="The original question / query string.",
    )
    context_used: str = Field(
        "",
        description="Formatted context block that was supplied to the LLM.",
    )

    @property
    def results(self) -> list[PageIndexResult]:
        """Read-only alias returning the ``sources`` list itself (not a copy)."""
        retrieved = self.sources
        return retrieved

    @property
    def pages(self) -> list[PageIndexResult]:
        """Read-only alias returning the ``sources`` list itself (not a copy)."""
        retrieved = self.sources
        return retrieved