Source code for ractogateway.rag.chunkers.semantic_chunker

"""Semantic chunker — splits at embedding-space boundaries.

Uses cosine similarity between adjacent sentence embeddings to detect
topic shifts.  Requires an :class:`~ractogateway.rag.embedders.base.BaseEmbedder`
and NLTK ``sent_tokenize``.

Install with:  pip install ractogateway[rag-nlp]
"""

from __future__ import annotations

import math
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from ractogateway.rag.embedders.base import BaseEmbedder


def _require_nltk() -> Any:
    """Import NLTK lazily and make sure its Punkt tokenizer data is available.

    Both the ``punkt`` and ``punkt_tab`` resources are looked up, in that
    order; any resource that cannot be found is downloaded quietly.

    Returns
    -------
    The imported ``nltk`` module.

    Raises
    ------
    ImportError
        If the ``nltk`` package is not installed.
    """
    try:
        import nltk
    except ImportError as exc:
        message = (
            "SemanticChunker requires the 'nltk' package. "
            "Install it with:  pip install ractogateway[rag-nlp]"
        )
        raise ImportError(message) from exc

    # (lookup path, download name) for every tokenizer resource we rely on.
    required_resources = (
        ("tokenizers/punkt", "punkt"),
        ("tokenizers/punkt_tab", "punkt_tab"),
    )
    for lookup_path, package_name in required_resources:
        try:
            nltk.data.find(lookup_path)
        except LookupError:
            nltk.download(package_name, quiet=True)
    return nltk


from ractogateway.rag._models.document import Chunk, ChunkMetadata, Document
from ractogateway.rag.chunkers.base import BaseChunker


def _cosine(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of vectors *a* and *b*.

    The dot product pairs elements positionally and stops at the shorter
    vector, while each norm is taken over its whole vector.  If either
    vector has zero norm (an empty vector included) the result is ``0.0``
    instead of a division by zero.
    """
    inner_product = sum(left * right for left, right in zip(a, b))
    magnitude_a = math.sqrt(sum(component * component for component in a))
    magnitude_b = math.sqrt(sum(component * component for component in b))
    # A zero-length direction has no meaningful angle; report "no similarity".
    if not (magnitude_a and magnitude_b):
        return 0.0
    return inner_product / (magnitude_a * magnitude_b)


class SemanticChunker(BaseChunker):
    """Split documents where adjacent-sentence similarity drops below a threshold.

    The document is sentence-tokenized with NLTK, all sentences are embedded
    in a single call to the embedder, and a chunk boundary is placed between
    two neighbouring sentences whose cosine similarity is below ``threshold``
    once the chunk being closed already holds ``min_chunk_size`` sentences.

    Parameters
    ----------
    embedder:
        Any :class:`~ractogateway.rag.embedders.base.BaseEmbedder` instance.
    threshold:
        Cosine similarity below which a split is inserted (default: ``0.5``).
    min_chunk_size:
        Minimum number of sentences per chunk (prevents ultra-fine splits).
        The final chunk of a document may still be shorter, because it
        receives whatever sentences remain after the last split.
    language:
        NLTK sentence tokenizer language.
    """

    def __init__(
        self,
        embedder: BaseEmbedder,
        threshold: float = 0.5,
        min_chunk_size: int = 2,
        language: str = "english",
    ) -> None:
        self.embedder = embedder
        self.threshold = threshold
        self.min_chunk_size = min_chunk_size
        self.language = language

    def chunk(self, document: Document) -> list[Chunk]:
        """Split *document* into semantically coherent chunks.

        Returns a list of :class:`Chunk` objects whose ``content`` is the
        chunk's sentences joined by single spaces and whose ``start_char`` /
        ``end_char`` delimit the span those sentences occupy in
        ``document.content``.  Documents with no sentences, or with at most
        ``min_chunk_size`` sentences, yield exactly one chunk.
        """
        nltk = _require_nltk()
        sentences = nltk.sent_tokenize(document.content, language=self.language)

        # Nothing to compare (or too little to split): skip the embedding call.
        if not sentences or len(sentences) <= self.min_chunk_size:
            return self._make_single_chunk(document, sentences)

        # Embed all sentences (batched)
        embeddings = self.embedder.embed(sentences)

        # Detect split points: index i opens a new chunk when sentence i is
        # dissimilar to sentence i-1 and the current chunk is long enough.
        split_indices: list[int] = [0]
        current_group_size = 1
        for i in range(1, len(sentences)):
            sim = _cosine(embeddings[i - 1], embeddings[i])
            if sim < self.threshold and current_group_size >= self.min_chunk_size:
                split_indices.append(i)
                current_group_size = 1
            else:
                current_group_size += 1
        split_indices.append(len(sentences))  # sentinel

        # Build chunks from consecutive pairs of split points
        groups = [
            sentences[lo:hi] for lo, hi in zip(split_indices, split_indices[1:])
        ]

        total = len(groups)
        chunks: list[Chunk] = []
        cursor = 0
        for idx, group in enumerate(groups):
            start, end = self._locate_span(document.content, group, cursor)
            # Resume after this chunk so that repeated sentence text is never
            # matched against an occurrence belonging to an earlier chunk.
            cursor = end
            chunks.append(
                self._build_chunk(document, " ".join(group), idx, total, start, end)
            )
        return chunks

    def _make_single_chunk(self, document: Document, sentences: list[str]) -> list[Chunk]:
        """Wrap all *sentences* of *document* in a one-element chunk list.

        With an empty *sentences* list the chunk has empty content and a
        ``0``–``0`` character span.
        """
        start, end = self._locate_span(document.content, sentences, 0)
        return [self._build_chunk(document, " ".join(sentences), 0, 1, start, end)]

    @staticmethod
    def _locate_span(content: str, sentences: list[str], cursor: int) -> tuple[int, int]:
        """Return ``(start, end)`` offsets in *content* spanned by *sentences*.

        Each sentence is searched for at or after the end of the previous
        one (the first at or after *cursor*), so the span reflects the real
        whitespace between sentences instead of the single spaces used when
        joining them.  With no sentences the result is ``(cursor, cursor)``.
        """
        start = None
        end = cursor
        for sentence in sentences:
            position = content.find(sentence, end)
            if position == -1:
                # Not present verbatim past this point; assume it follows
                # directly so the offsets stay monotonic.
                position = end
            if start is None:
                start = position
            end = position + len(sentence)
        return (cursor if start is None else start), end

    @staticmethod
    def _build_chunk(
        document: Document,
        text: str,
        index: int,
        total: int,
        start: int,
        end: int,
    ) -> Chunk:
        """Create one :class:`Chunk` carrying *document*'s identity and metadata."""
        return Chunk(
            doc_id=document.doc_id,
            content=text,
            metadata=ChunkMetadata(
                source=document.source,
                chunk_index=index,
                total_chunks=total,
                start_char=start,
                end_char=end,
                doc_id=document.doc_id,
                # Copy so later edits to a chunk's extras don't touch the document.
                extra=dict(document.metadata),
            ),
        )