# Source code for ractogateway.cache.exact_cache

"""Exact-match key-value cache with LRU eviction and optional TTL.

Uses ``collections.OrderedDict`` for O(1) get / put / evict — a standard
least-recently-used (LRU) cache pattern.  No external dependencies.

Thread-safety is provided by a ``threading.Lock`` so the cache is safe to
share across threads without any external synchronisation.
"""

from __future__ import annotations

import hashlib
import threading
import time
from collections import OrderedDict

from ractogateway.adapters.base import LLMResponse
from ractogateway.cache._models import CacheConfig, CacheEntry, CacheStats


def _make_key(
    user_message: str,
    system_prompt: str,
    model: str,
    temperature: float,
    max_tokens: int,
) -> str:
    """Build a deterministic SHA-256 cache key from request parameters.

    Each field is serialised as ``<length>:<text>`` before hashing.  The
    length prefix makes the encoding unambiguous: with a plain separator, a
    separator character inside one field could shift the field boundaries
    and make two different requests hash to the same key.

    Parameters
    ----------
    user_message, system_prompt, model:
        Text fields of the request.
    temperature, max_tokens:
        Sampling parameters; included via their ``str()`` form, so ``0`` and
        ``0.0`` yield different keys.

    Returns
    -------
    str
        Hex-encoded SHA-256 digest (64 characters).
    """
    fields = (user_message, system_prompt, model, str(temperature), str(max_tokens))
    raw = "".join(f"{len(field)}:{field}" for field in fields)
    # "surrogatepass" keeps lone surrogates (possible in text decoded from
    # JSON escapes) from raising UnicodeEncodeError while staying deterministic.
    return hashlib.sha256(raw.encode("utf-8", "surrogatepass")).hexdigest()



class ExactMatchCache:
    """Ultra-low-latency key-value cache for identical LLM requests.

    Entries are keyed by the digest that ``_make_key`` derives from
    ``(user_message, system_prompt, model, temperature, max_tokens)``.
    Recency is tracked by the position of each key in an ``OrderedDict``:
    the first item is the least-recently-used entry, the last item the
    most-recently-used one.  Every access to the store and to the hit/miss
    counters happens while holding an internal ``threading.Lock``.

    Parameters
    ----------
    max_size:
        LRU capacity.  ``0`` = unlimited (no eviction).
    ttl_seconds:
        Entries older than *ttl_seconds* are treated as misses and
        transparently evicted.  ``None`` disables expiry.

    Example::

        from ractogateway.cache import ExactMatchCache

        cache = ExactMatchCache(max_size=512, ttl_seconds=3600)

        # Wire into a kit:
        kit = OpenAIDeveloperKit(model="gpt-4o", exact_cache=cache)
    """

    def __init__(
        self,
        max_size: int = 1024,
        ttl_seconds: float | None = None,
    ) -> None:
        self._config = CacheConfig(max_size=max_size, ttl_seconds=ttl_seconds)
        # OrderedDict: insertion order = LRU order; move_to_end(key) on hit.
        self._store: OrderedDict[str, CacheEntry] = OrderedDict()
        # Guards _store and both counters.
        self._lock = threading.Lock()
        self._hits = 0
        self._misses = 0

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def get(
        self,
        user_message: str,
        system_prompt: str,
        model: str,
        temperature: float,
        max_tokens: int,
    ) -> LLMResponse | None:
        """Return a cached response or ``None`` on a miss.

        A lookup counts as a miss when the key is absent or when the entry
        is older than the configured TTL; an expired entry is removed as a
        side effect.  A hit promotes the entry to the most-recently-used
        position and increments both the entry's and the cache's hit count.

        O(1) — dictionary lookup + optional move-to-end.
        """
        key = _make_key(user_message, system_prompt, model, temperature, max_tokens)
        with self._lock:
            entry = self._store.get(key)
            if entry is None:
                self._misses += 1
                return None

            # TTL check: ages are measured on the monotonic clock, matching
            # the timestamps written by put().
            ttl = self._config.ttl_seconds
            if ttl is not None and time.monotonic() - entry.created_at > ttl:
                del self._store[key]
                self._misses += 1
                return None

            # Cache hit — promote to most-recently-used position
            self._store.move_to_end(key)
            entry.hit_count += 1
            self._hits += 1
            return entry.response

    def put(
        self,
        user_message: str,
        system_prompt: str,
        model: str,
        temperature: float,
        max_tokens: int,
        response: LLMResponse,
    ) -> None:
        """Store a response.  Evicts LRU entries when over capacity.

        If the key is already cached, its response and creation time are
        replaced in place (the entry's hit count is left untouched) and the
        entry becomes the most-recently-used one.

        O(1) amortised — dictionary insert + optional popitem(last=False).
        """
        key = _make_key(user_message, system_prompt, model, temperature, max_tokens)
        with self._lock:
            now = time.monotonic()
            existing = self._store.get(key)
            if existing is not None:
                # Update in place; promote to MRU end.
                existing.response = response
                existing.created_at = now
                self._store.move_to_end(key)
                return

            # A new key is appended at the MRU end by OrderedDict itself.
            self._store[key] = CacheEntry(response=response, created_at=now)

            # Evict least-recently-used entries while over capacity; a
            # non-positive capacity disables eviction.
            cap = self._config.max_size
            if cap > 0:
                while len(self._store) > cap:
                    self._store.popitem(last=False)  # O(1) — pop from LRU end

    def invalidate(
        self,
        user_message: str,
        system_prompt: str,
        model: str,
        temperature: float,
        max_tokens: int,
    ) -> bool:
        """Remove a specific entry.  Returns ``True`` if it was present."""
        key = _make_key(user_message, system_prompt, model, temperature, max_tokens)
        with self._lock:
            # Stored values are always CacheEntry objects, never None, so a
            # None result from pop() means the key was absent.
            return self._store.pop(key, None) is not None

    def clear(self) -> None:
        """Evict all cached entries and reset counters."""
        with self._lock:
            self._store.clear()
            self._hits = 0
            self._misses = 0

    @property
    def stats(self) -> CacheStats:
        """Return a snapshot of hit/miss/size counters."""
        with self._lock:
            return CacheStats(hits=self._hits, misses=self._misses, size=len(self._store))

    def __len__(self) -> int:
        """Return the number of stored entries.

        Expired entries are only removed when looked up via ``get``, so they
        are still counted here until then.
        """
        with self._lock:
            return len(self._store)

    def __repr__(self) -> str:  # pragma: no cover
        s = self.stats
        ttl = self._config.ttl_seconds
        # Only append the unit when a TTL is set; avoids rendering "Nones".
        ttl_text = "None" if ttl is None else f"{ttl}s"
        return (
            f"ExactMatchCache(max_size={self._config.max_size}, "
            f"ttl={ttl_text}, "
            f"size={s.size}, hit_rate={s.hit_rate:.1%})"
        )