# Source code for ractogateway.rag.readers.word_reader

"""Word document reader — uses ``python-docx`` (lazy import).

Install with:  pip install ractogateway[rag-word]
"""

from __future__ import annotations

import io
from pathlib import Path
from typing import Any


def _require_docx() -> Any:
    """Import ``python-docx`` on demand and return the ``docx`` module.

    The import is deferred to call time so that this module can be loaded
    without the optional dependency installed.

    Returns:
        The imported ``docx`` module.

    Raises:
        ImportError: If ``docx`` cannot be imported; the message carries the
            install hint and the original error is chained as the cause.
    """
    try:
        import docx as docx_module
    except ImportError as import_error:
        message = (
            "Reading Word (.docx) files requires the 'python-docx' package. "
            "Install it with:  pip install ractogateway[rag-word]"
        )
        raise ImportError(message) from import_error
    else:
        return docx_module


from ractogateway.rag._models.document import Document
from ractogateway.rag.readers.base import BaseReader


class WordReader(BaseReader):
    """Extract text from Microsoft Word (.docx) files using ``python-docx``.

    Accepts a file path (``str`` / ``Path``), raw ``bytes``, or any binary
    file-like object with a ``.read()`` method.

    The resulting content is every non-blank paragraph followed by one
    tab-separated line per non-empty table row, all joined by blank lines.
    """

    @property
    def supported_extensions(self) -> frozenset[str]:
        """Return the file extensions this reader handles (``.docx`` only)."""
        return frozenset({".docx"})

    def _read_path(self, path: Path) -> Document:
        """Read the ``.docx`` file at *path* into a ``Document``.

        Args:
            path: Location of the Word file on disk.

        Returns:
            A ``Document`` whose ``source`` is the resolved path and whose
            metadata holds ``extension``, ``filename``, ``size_bytes``,
            ``author``, ``title`` and ``paragraph_count``.

        Raises:
            ImportError: If ``python-docx`` is not installed.
        """
        docx = _require_docx()
        doc = docx.Document(str(path))
        return self._document_from_docx(
            doc,
            source=str(path.resolve()),
            base_metadata={
                "extension": ".docx",
                "filename": path.name,
                "size_bytes": path.stat().st_size,
            },
        )

    def _read_bytes(self, data: bytes, *, source_label: str = "<bytes>") -> Document:
        """Read an in-memory ``.docx`` payload into a ``Document``.

        Args:
            data: Raw bytes of the Word file.
            source_label: Value stored as the document's ``source``.

        Returns:
            A ``Document`` whose metadata holds ``size_bytes``, ``author``,
            ``title`` and ``paragraph_count``.

        Raises:
            ImportError: If ``python-docx`` is not installed.
        """
        docx = _require_docx()
        doc = docx.Document(io.BytesIO(data))
        return self._document_from_docx(
            doc,
            source=source_label,
            base_metadata={"size_bytes": len(data)},
        )

    def _document_from_docx(
        self,
        doc: Any,
        *,
        source: str,
        base_metadata: dict[str, Any],
    ) -> Document:
        """Build a ``Document`` from an opened python-docx document.

        Shared by the path and bytes readers so both produce identical
        content and core-property metadata.

        Args:
            doc: Object returned by ``docx.Document(...)``.
            source: Value stored as the document's ``source``.
            base_metadata: Caller-specific metadata; these keys come first,
                followed by ``author``, ``title`` and ``paragraph_count``.
        """
        paragraphs, tables_text = self._extract_content(doc)
        core_props = doc.core_properties
        return Document(
            content="\n\n".join(paragraphs + tables_text),
            source=source,
            metadata={
                **base_metadata,
                # Missing core properties are normalised to empty strings.
                "author": core_props.author or "",
                "title": core_props.title or "",
                "paragraph_count": len(paragraphs),
            },
        )

    def _extract_content(self, doc: Any) -> tuple[list[str], list[str]]:
        """Collect paragraph texts and table-row texts from *doc*.

        Args:
            doc: Object returned by ``docx.Document(...)``.

        Returns:
            A ``(paragraphs, tables_text)`` pair. ``paragraphs`` holds the
            unmodified text of every paragraph that is not blank.
            ``tables_text`` holds one entry per table row that has at least
            one non-empty cell: the stripped cell texts joined by tabs.
        """
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]

        tables_text: list[str] = []
        for table in doc.tables:
            for row in table.rows:
                cell_texts: list[str] = []
                previous_element = None
                for cell in row.cells:
                    # python-docx repeats a horizontally merged cell once per
                    # grid column it spans (see its table documentation), so
                    # consecutive entries backed by the same underlying
                    # element are skipped to avoid duplicating the text.
                    element = getattr(cell, "_tc", None)
                    if element is not None and element is previous_element:
                        continue
                    previous_element = element

                    text = cell.text.strip()  # evaluate the cell text once
                    if text:
                        cell_texts.append(text)
                if cell_texts:
                    tables_text.append("\t".join(cell_texts))
        return paragraphs, tables_text