Source code for ractogateway.rag.readers.pdf_reader

"""PDF reader — uses ``pypdf`` (lazy import).

Install with:  pip install ractogateway[rag-pdf]
"""

from __future__ import annotations

from pathlib import Path
from typing import Any


def _require_pypdf() -> Any:
    """Import ``pypdf`` on demand and return the module object.

    The import happens inside the function so that this module can be
    imported even when the optional PDF dependency is not installed.

    Returns
    -------
    The imported ``pypdf`` module.

    Raises
    ------
    ImportError
        If ``pypdf`` cannot be imported. The original import error is
        chained as the cause and the message carries the install hint.
    """
    try:
        import pypdf as pdf_module
    except ImportError as err:
        hint = (
            "Reading PDF files requires the 'pypdf' package. "
            "Install it with:  pip install ractogateway[rag-pdf]"
        )
        raise ImportError(hint) from err
    else:
        return pdf_module


from ractogateway.rag._models.document import Document
from ractogateway.rag.readers.base import BaseReader


class PdfReader(BaseReader):
    """Reader that converts a PDF file into a ``Document`` using ``pypdf``.

    Parameters
    ----------
    extract_images:
        Reserved for future use — image extraction is not yet supported.
        The value is stored on the instance but not consulted by ``read``.
    """

    def __init__(self, extract_images: bool = False) -> None:
        self._extract_images = extract_images

    @property
    def supported_extensions(self) -> frozenset[str]:
        """File suffixes this reader handles (only ``".pdf"``)."""
        return frozenset((".pdf",))

    def read(self, path: Path) -> Document:
        """Extract the text of every page of the PDF at *path*.

        Parameters
        ----------
        path:
            Location of the PDF file to read.

        Returns
        -------
        Document
            ``content`` holds the text of all non-blank pages joined by a
            blank line; ``source`` is the resolved path as a string;
            ``metadata`` carries ``extension``, ``filename``, ``size_bytes``,
            ``total_pages`` and ``page_map``. ``page_map`` is a list of
            ``(page_number, text)`` tuples with 1-based page numbers and
            includes blank pages (as empty strings).

        Raises
        ------
        ImportError
            If ``pypdf`` is not installed (raised by ``_require_pypdf``).
        """
        pdf_lib = _require_pypdf()

        with pdf_lib.PdfReader(str(path)) as pdf:
            total_pages = len(pdf.pages)
            # extract_text() may yield a falsy value; normalise it to "".
            page_map: list[tuple[int, str]] = [
                (number, page.extract_text() or "")
                for number, page in enumerate(pdf.pages, start=1)
            ]

        # Whitespace-only pages stay in page_map but are left out of content.
        non_blank = [text for _, text in page_map if text.strip()]
        full_text = "\n\n".join(non_blank)

        resolved_source = str(path.resolve())
        metadata = {
            "extension": ".pdf",
            "filename": path.name,
            "size_bytes": path.stat().st_size,
            "total_pages": total_pages,
            "page_map": page_map,
        }
        return Document(
            content=full_text,
            source=resolved_source,
            metadata=metadata,
        )