Source code for ractogateway.rag.chunkers.semantic_chunker

"""Semantic chunker — splits at embedding-space boundaries.

Uses cosine similarity between adjacent sentence embeddings to detect
topic shifts.  Requires an :class:`~ractogateway.rag.embedders.base.BaseEmbedder`
and NLTK ``sent_tokenize``.

Install with:  pip install ractogateway[rag-nlp]
"""

from __future__ import annotations

import math
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from ractogateway.rag.embedders.base import BaseEmbedder


def _require_nltk() -> Any:
    try:
        import nltk
    except ImportError as exc:
        raise ImportError(
            "SemanticChunker requires the 'nltk' package. "
            "Install it with:  pip install ractogateway[rag-nlp]"
        ) from exc
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt", quiet=True)
    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        nltk.download("punkt_tab", quiet=True)
    return nltk


from ractogateway.rag._models.document import Chunk, ChunkMetadata, Document
from ractogateway.rag.chunkers.base import BaseChunker


def _cosine(a: list[float], b: list[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b, strict=False))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)



[docs]
class SemanticChunker(BaseChunker):
    """Split documents where the semantic similarity between adjacent
    sentences drops below a threshold.

    The document is tokenized into sentences with NLTK, every sentence is
    embedded, and a new chunk is started wherever the cosine similarity
    between two neighbouring sentences falls below ``threshold``.

    Parameters
    ----------
    embedder:
        Any :class:`~ractogateway.rag.embedders.base.BaseEmbedder` instance.
    threshold:
        Cosine similarity below which a split is inserted (default: ``0.5``).
    min_chunk_size:
        Minimum number of sentences per chunk (prevents ultra-fine splits).
        A split is only made once the running chunk has reached this size,
        so the final chunk of a document may still be shorter.
    language:
        NLTK sentence tokenizer language.
    """

    def __init__(
        self,
        embedder: BaseEmbedder,
        threshold: float = 0.5,
        min_chunk_size: int = 2,
        language: str = "english",
    ) -> None:
        self.embedder = embedder
        self.threshold = threshold
        self.min_chunk_size = min_chunk_size
        self.language = language

    def chunk(self, document: Document) -> list[Chunk]:
        """Split *document* into semantically coherent chunks.

        Parameters
        ----------
        document:
            The document to split; its ``content`` is sentence-tokenized.

        Returns
        -------
        One chunk per detected group of sentences, in document order.  Each
        chunk's ``content`` is the group's sentences joined by single spaces,
        while ``start_char`` / ``end_char`` are offsets into
        ``document.content`` (so their difference can exceed
        ``len(content)`` when sentences were separated by more than one
        character in the original text).  Documents with no more than
        ``min_chunk_size`` sentences yield a single chunk and are not
        embedded.
        """
        nltk = _require_nltk()
        sentences = nltk.sent_tokenize(document.content, language=self.language)

        # Too short to split.  The explicit emptiness test also protects the
        # grouping code below from an empty sentence list when
        # ``min_chunk_size`` is negative.
        if not sentences or len(sentences) <= self.min_chunk_size:
            return self._make_single_chunk(document, sentences)

        # Embed all sentences in one batched call.
        embeddings = self.embedder.embed(sentences)

        # A boundary at index i means "sentence i starts a new chunk".
        boundaries: list[int] = [0]
        run_length = 1
        for i in range(1, len(sentences)):
            similarity = _cosine(embeddings[i - 1], embeddings[i])
            if similarity < self.threshold and run_length >= self.min_chunk_size:
                boundaries.append(i)
                run_length = 1
            else:
                run_length += 1
        boundaries.append(len(sentences))  # sentinel closing the last group

        spans = self._sentence_spans(document.content, sentences)
        edges = list(zip(boundaries, boundaries[1:], strict=False))
        total = len(edges)
        return [
            self._build_chunk(document, sentences[lo:hi], spans[lo:hi], index, total)
            for index, (lo, hi) in enumerate(edges)
        ]

    def _make_single_chunk(self, document: Document, sentences: list[str]) -> list[Chunk]:
        """Wrap all *sentences* of *document* in a one-element chunk list."""
        spans = self._sentence_spans(document.content, sentences)
        return [self._build_chunk(document, sentences, spans, 0, 1)]

    @staticmethod
    def _sentence_spans(content: str, sentences: list[str]) -> list[tuple[int, int]]:
        """Return the ``(start, end)`` offset of each sentence within *content*.

        Sentences are located left to right, and every search resumes after
        the previously located sentence, so a sentence that occurs several
        times in the document is mapped to its own occurrence rather than to
        an earlier one.
        """
        spans: list[tuple[int, int]] = []
        cursor = 0
        for sentence in sentences:
            start = content.find(sentence, cursor)
            if start == -1:
                # The tokenizer returned text that is not a verbatim slice of
                # the document: approximate the span at the current position
                # and leave the cursor alone so later sentences are not
                # skipped over.
                start = cursor
                end = min(start + len(sentence), len(content))
            else:
                end = start + len(sentence)
                cursor = end
            spans.append((start, end))
        return spans

    @staticmethod
    def _build_chunk(
        document: Document,
        sentences: list[str],
        spans: list[tuple[int, int]],
        index: int,
        total: int,
    ) -> Chunk:
        """Assemble one chunk from a group of sentences and their spans.

        *spans* must be parallel to *sentences*; an empty group produces an
        empty chunk covering ``[0, 0)``.
        """
        start = spans[0][0] if spans else 0
        end = spans[-1][1] if spans else 0
        return Chunk(
            doc_id=document.doc_id,
            content=" ".join(sentences),
            metadata=ChunkMetadata(
                source=document.source,
                chunk_index=index,
                total_chunks=total,
                start_char=start,
                end_char=end,
                doc_id=document.doc_id,
                # Copy so chunks never share (and mutate) the document's dict.
                extra=dict(document.metadata),
            ),
        )