# Source code for ractogateway.rag.processors.cleaner

"""Text cleaning processor — no extra dependencies."""

from __future__ import annotations

import re
import unicodedata

from ractogateway.rag.processors.base import BaseProcessor

# C0/C1 control characters plus DEL. Tab (\x09), newline (\x0a) and carriage
# return (\x0d) are deliberately left out of the class so they survive.
_CONTROL_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]")
# Three or more consecutive newlines (two or more blank lines) → double newline
_BLANK_LINES_RE = re.compile(r"\n{3,}")
# Runs of spaces and/or tabs (never newlines) → single space
_MULTI_SPACE_RE = re.compile(r"[ \t]+")
# Residual HTML markup.  A bare ``<[^>]+>`` also swallows ordinary prose such
# as ``a < b and c > d``, so a match must look like markup right after ``<``:
#   * ``<!-- ... -->``      comments, matched whole even if they contain ``>``
#   * ``<tag ...>``/``</tag>``  the name must start with an ASCII letter
#   * ``<!...>`` / ``<?...>``   doctype, CDATA and processing instructions
_HTML_TAG_RE = re.compile(
    r"<!--.*?-->"
    r"|<(?:/?[A-Za-z]|[!?])[^>]*>",
    re.DOTALL,  # lets ``.`` in the comment branch span line breaks
)



class TextCleaner(BaseProcessor):
    """Normalise text for embedding and retrieval.

    Steps are applied in the order listed.  Steps 1-5 can each be switched
    off through the matching constructor flag; step 6 always runs.

    1. Unicode normalisation (NFC)
    2. Replace residual HTML tags with a single space
    3. Remove control characters
    4. Collapse runs of spaces/tabs to one space
    5. Collapse runs of blank lines to at most two newlines
    6. Strip leading/trailing whitespace

    Parameters
    ----------
    normalize_unicode:
        Apply ``unicodedata.normalize("NFC", text)``.
    strip_html:
        Replace ``<tag>`` patterns with a space.
    strip_control_chars:
        Remove non-printable control characters.
    collapse_whitespace:
        Collapse sequences of spaces/tabs to a single space.
    collapse_blank_lines:
        Collapse 3+ consecutive newlines to 2.
    """

    def __init__(
        self,
        normalize_unicode: bool = True,
        strip_html: bool = True,
        strip_control_chars: bool = True,
        collapse_whitespace: bool = True,
        collapse_blank_lines: bool = True,
    ) -> None:
        # Each flag toggles exactly one step of ``process``.
        self.normalize_unicode = normalize_unicode
        self.strip_html = strip_html
        self.strip_control_chars = strip_control_chars
        self.collapse_whitespace = collapse_whitespace
        self.collapse_blank_lines = collapse_blank_lines

    def process(self, text: str) -> str:
        """Return *text* with every enabled cleaning step applied.

        The result is always stripped of leading and trailing whitespace,
        regardless of which flags are set.
        """
        if self.normalize_unicode:
            text = unicodedata.normalize("NFC", text)
        if self.strip_html:
            # Substitute a space rather than "" so words that were separated
            # only by a tag do not get glued together; the whitespace-collapse
            # step below tidies any resulting runs of spaces.
            text = _HTML_TAG_RE.sub(" ", text)
        if self.strip_control_chars:
            text = _CONTROL_RE.sub("", text)
        if self.collapse_whitespace:
            text = _MULTI_SPACE_RE.sub(" ", text)
        if self.collapse_blank_lines:
            text = _BLANK_LINES_RE.sub("\n\n", text)
        return text.strip()