Source code for ractogateway.rag.readers.text_reader

"""Plain-text reader — handles .txt, .md, .rst, .log and similar files."""

from __future__ import annotations

from pathlib import Path

from ractogateway.rag._models.document import Document
from ractogateway.rag.readers.base import BaseReader


class TextReader(BaseReader):
    """Read any UTF-8 (or latin-1 fallback) plain-text file.

    No external dependencies required.  Accepts a file path (``str`` /
    ``Path``), raw ``bytes``, or any binary file-like object with a
    ``.read()`` method.

    A single leading byte-order mark (``U+FEFF``) left in the decoded text is
    removed, so files saved with a UTF-8 BOM do not start with an invisible
    character.

    Parameters
    ----------
    encoding:
        Primary encoding to try.  Falls back to ``"latin-1"`` on error.
    """

    # latin-1 maps every byte value to a code point, so decoding with it
    # cannot raise UnicodeDecodeError; it serves as the last-resort decoder.
    _FALLBACK_ENCODING = "latin-1"

    # Built once at class-definition time instead of on every property access.
    _EXTENSIONS = frozenset(
        {
            ".txt",
            ".md",
            ".markdown",
            ".rst",
            ".log",
            ".ini",
            ".cfg",
            ".toml",
            ".yaml",
            ".yml",
            ".json",
            ".jsonl",
            ".xml",
            ".tex",
        }
    )

    def __init__(self, encoding: str = "utf-8") -> None:
        self._encoding = encoding

    @property
    def supported_extensions(self) -> frozenset[str]:
        """File suffixes (lower-case, with leading dot) this reader handles."""
        return self._EXTENSIONS

    @staticmethod
    def _strip_bom(text: str) -> str:
        """Return *text* without a single leading U+FEFF byte-order mark."""
        # The plain "utf-8" codec keeps the BOM as a U+FEFF character; it is
        # never meaningful content at the very start of a text file.
        if text.startswith("\ufeff"):
            return text[1:]
        return text

    def _read_path(self, path: Path) -> Document:
        """Read the file at *path* and wrap its text in a ``Document``.

        The text is decoded with the configured encoding, retrying with
        latin-1 if that raises ``UnicodeDecodeError``.  The metadata records
        the lower-cased suffix, the file name and the on-disk size in bytes.
        """
        try:
            content = path.read_text(encoding=self._encoding)
        except UnicodeDecodeError:
            content = path.read_text(encoding=self._FALLBACK_ENCODING)
        return Document(
            content=self._strip_bom(content),
            source=str(path.resolve()),
            metadata={
                "extension": path.suffix.lower(),
                "filename": path.name,
                "size_bytes": path.stat().st_size,
            },
        )

    def _read_bytes(self, data: bytes, *, source_label: str = "<bytes>") -> Document:
        """Decode *data* and wrap the text in a ``Document``.

        The bytes are decoded with the configured encoding, retrying with
        latin-1 if that raises ``UnicodeDecodeError``.  ``source_label`` is
        stored as the document source; ``size_bytes`` is the length of the
        raw input (including any BOM bytes).
        """
        try:
            content = data.decode(self._encoding)
        except UnicodeDecodeError:
            content = data.decode(self._FALLBACK_ENCODING)
        return Document(
            content=self._strip_bom(content),
            source=source_label,
            metadata={"size_bytes": len(data)},
        )