"""Core document and chunk models for RAG.
Every piece of content in the RAG pipeline is represented as a ``Document``
(raw, as loaded from a file) or a ``Chunk`` (a processed, embeddable slice of
a document). Both are strict Pydantic models with no unvalidated fields.
"""
from __future__ import annotations
import uuid
from typing import Any
from pydantic import BaseModel, Field
def _new_id() -> str:
    """Return a freshly generated random (version 4) UUID as a string.

    Used as the ``default_factory`` for the ID fields of the models in this
    module, so every instance gets its own identifier unless the caller
    passes one explicitly.
    """
    return str(uuid.uuid4())
class Document(BaseModel):
    """A raw document loaded from a file or supplied as plain text.

    Parameters
    ----------
    content:
        The full extracted text of the document. Required.
    source:
        Absolute file path, URL, or a descriptive label (e.g. ``"manual"``).
        Required.
    metadata:
        Free-form key/value pairs (file size, author, MIME type, …).
        Defaults to a new empty dict.
    doc_id:
        Auto-generated UUID; override only when you need stable IDs.
    """

    # Generated by ``_new_id`` whenever the caller does not supply one.
    doc_id: str = Field(default_factory=_new_id)
    # No default is declared for ``content`` or ``source``, so both must be
    # passed when the model is constructed.
    content: str = Field(description="Full extracted text content of the document.")
    source: str = Field(description="Absolute path, URL, or label that identifies the source.")
    # ``default_factory`` builds a fresh dict per instance rather than sharing
    # one mutable default between documents.
    metadata: dict[str, Any] = Field(default_factory=dict)
class Chunk(BaseModel):
    """A single embeddable slice of a document.

    Produced by a :class:`~ractogateway.rag.chunkers.base.BaseChunker` and
    enriched with an embedding vector by a
    :class:`~ractogateway.rag.embedders.base.BaseEmbedder`.

    Parameters
    ----------
    chunk_id:
        Auto-generated UUID identifying this chunk.
    doc_id:
        UUID of the parent ``Document``. Required.
    content:
        Text content of this chunk (post-processing). Required.
    embedding:
        Dense embedding vector, or ``None`` (the default) while the chunk has
        not been embedded yet.
    metadata:
        Chunk-level metadata. Required.
    """

    # Generated by ``_new_id`` whenever the caller does not supply one.
    chunk_id: str = Field(default_factory=_new_id)
    # No default is declared for ``doc_id`` or ``content``, so both must be
    # passed when the model is constructed.
    doc_id: str = Field(description="UUID of the parent Document.")
    content: str = Field(description="Text content of this chunk (post-processing).")
    # ``None`` marks a chunk that has no embedding attached yet.
    embedding: list[float] | None = Field(
        default=None,
        description="Dense embedding vector; populated after embed() is called.",
    )
    # NOTE(review): ``ChunkMetadata`` is neither defined nor imported in the
    # visible part of this module — confirm it is in scope before this class
    # is built, or that ``Chunk.model_rebuild()`` is called once it exists.
    metadata: ChunkMetadata