add llm response cache (redis)

2025-03-21 18:17:41 +00:00
parent ea2b70f5a3
commit 74432c9f80
4 changed files with 369 additions and 0 deletions
@@ -1,12 +1,15 @@
 """LLM client and prompt management."""

+from arbiter.llm.cache import LLMCache, compute_policy_hash
 from arbiter.llm.client import LiteLLMClient, LLMClient, LLMResponse
 from arbiter.llm.prompts import PromptRegistry, PromptTemplate

 __all__ = [
+    "LLMCache",
    "LLMClient",
    "LLMResponse",
    "LiteLLMClient",
    "PromptRegistry",
    "PromptTemplate",
+    "compute_policy_hash",
 ]
@@ -0,0 +1,210 @@
+"""Redis-backed cache for LLM responses."""
+
+import hashlib
+import json
+import logging
+from typing import Any
+
+from redis.asyncio import Redis
+
+from arbiter.config import get_settings
+from arbiter.llm.client import LLMResponse
+
+logger = logging.getLogger(__name__)
+
+
+class LLMCache:
+    """Redis-backed cache for LLM responses.
+
+    Cache key format: arbiter:llm:cache:{agent_tag}:{digest}
+
+    - agent_tag: first 16 hex chars of sha256(agent). Having an agent-derived
+      segment in the key is what lets clear_agent() select one agent's entries
+      with a SCAN pattern; hashing the name keeps the segment free of glob
+      metacharacters and of the ":" separator.
+    - digest: sha256 of the JSON-encoded list
+      [diff, agent, prompt_version, policy_hash]. JSON encoding keeps field
+      boundaries unambiguous, which a plain delimiter join does not.
+
+    The cache is best-effort: errors raised while talking to Redis or while
+    (de)serializing are logged and swallowed so callers never fail because of
+    the cache.
+
+    NOTE(review): the model name is not part of the key, so a response cached
+    for one model keeps being served after a model switch until it expires --
+    confirm this is intended.
+    """
+
+    PREFIX = "arbiter:llm:cache"
+    _AGENT_TAG_LEN = 16
+
+    def __init__(self, redis: Redis) -> None:
+        """Bind the cache to a Redis client.
+
+        Args:
+            redis: Async Redis client used for all cache operations.
+        """
+        self.redis = redis
+        self.settings = get_settings()
+        # Hit/miss counters live in this instance only; see get_stats().
+        self._hits = 0
+        self._misses = 0
+
+    def _agent_tag(self, agent: str) -> str:
+        """Return the fixed-width key segment that identifies an agent."""
+        return hashlib.sha256(agent.encode()).hexdigest()[: self._AGENT_TAG_LEN]
+
+    def _compute_key(
+        self,
+        diff: str,
+        agent: str,
+        prompt_version: str,
+        policy_hash: str | None = None,
+    ) -> str:
+        """Compute cache key from inputs.
+
+        Args:
+            diff: The diff content being reviewed.
+            agent: Agent name.
+            prompt_version: Version of the prompt template.
+            policy_hash: Optional hash of policy configuration. A missing or
+                empty value is keyed as "default".
+
+        Returns:
+            Cache key string.
+        """
+        components = [
+            diff,
+            agent,
+            prompt_version,
+            policy_hash or "default",
+        ]
+        # A "|"-join would let ("v|p", "q") and ("v", "p|q") hash to the same
+        # key; a JSON list quotes every field, so that cannot happen.
+        content = json.dumps(components)
+        digest = hashlib.sha256(content.encode()).hexdigest()
+        return f"{self.PREFIX}:{self._agent_tag(agent)}:{digest}"
+
+    def _serialize_response(self, response: LLMResponse) -> str:
+        """Serialize LLMResponse to JSON string."""
+        return json.dumps(
+            {
+                "content": response.content,
+                "model": response.model,
+                "tokens_in": response.tokens_in,
+                "tokens_out": response.tokens_out,
+                "cost_usd": response.cost_usd,
+            }
+        )
+
+    def _deserialize_response(self, data: str) -> LLMResponse:
+        """Rebuild an LLMResponse from the JSON written by _serialize_response.
+
+        Raises whatever json.loads raises on malformed input, and KeyError when
+        an expected field is absent.
+        """
+        parsed = json.loads(data)
+        return LLMResponse(
+            content=parsed["content"],
+            model=parsed["model"],
+            tokens_in=parsed["tokens_in"],
+            tokens_out=parsed["tokens_out"],
+            cost_usd=parsed["cost_usd"],
+        )
+
+    async def get(
+        self,
+        diff: str,
+        agent: str,
+        prompt_version: str,
+        policy_hash: str | None = None,
+    ) -> LLMResponse | None:
+        """Get cached LLM response if available.
+
+        Args:
+            diff: The diff content.
+            agent: Agent name.
+            prompt_version: Prompt version.
+            policy_hash: Optional policy hash.
+
+        Returns:
+            Cached LLMResponse, or None when nothing is stored, the stored
+            payload cannot be decoded, or Redis fails. Every None is counted
+            as a miss.
+        """
+        key = self._compute_key(diff, agent, prompt_version, policy_hash)
+
+        try:
+            data = await self.redis.get(key)
+            # Decode before touching the counters so an unreadable payload is
+            # recorded as one miss, not as a hit plus a miss.
+            response = self._deserialize_response(data) if data else None
+        except Exception as e:
+            # Deliberately broad: a broken cache must not break the caller.
+            logger.warning("Cache get error: %s", e)
+            self._misses += 1
+            return None
+
+        if response is None:
+            self._misses += 1
+            return None
+
+        self._hits += 1
+        logger.debug("Cache hit for %s", key[:50])
+        return response
+
+    async def set(
+        self,
+        diff: str,
+        agent: str,
+        prompt_version: str,
+        response: LLMResponse,
+        policy_hash: str | None = None,
+    ) -> None:
+        """Cache an LLM response.
+
+        The entry expires after settings.cache_ttl_hours. Nothing is written
+        when that TTL works out to less than one second.
+
+        Args:
+            diff: The diff content.
+            agent: Agent name.
+            prompt_version: Prompt version.
+            response: LLM response to cache.
+            policy_hash: Optional policy hash.
+        """
+        key = self._compute_key(diff, agent, prompt_version, policy_hash)
+        # Redis wants a whole number of seconds; coerce in case the setting is
+        # fractional.
+        ttl_seconds = int(self.settings.cache_ttl_hours * 3600)
+        if ttl_seconds <= 0:
+            # Redis rejects a non-positive expiry, so skip the round trip.
+            logger.debug("Non-positive cache TTL (%ds); not caching %s", ttl_seconds, key[:50])
+            return
+
+        try:
+            serialized = self._serialize_response(response)
+            await self.redis.set(key, serialized, ex=ttl_seconds)
+            logger.debug("Cached response for %s (TTL: %ds)", key[:50], ttl_seconds)
+        except Exception as e:
+            # Deliberately broad: failing to cache is never fatal.
+            logger.warning("Cache set error: %s", e)
+
+    async def invalidate(
+        self,
+        diff: str,
+        agent: str,
+        prompt_version: str,
+        policy_hash: str | None = None,
+    ) -> bool:
+        """Invalidate a cached response.
+
+        Args:
+            diff: The diff content.
+            agent: Agent name.
+            prompt_version: Prompt version.
+            policy_hash: Optional policy hash.
+
+        Returns:
+            True if a key was deleted; False if none existed or Redis failed.
+        """
+        key = self._compute_key(diff, agent, prompt_version, policy_hash)
+
+        try:
+            deleted: int = await self.redis.delete(key)
+            return deleted > 0
+        except Exception as e:
+            logger.warning("Cache invalidate error: %s", e)
+            return False
+
+    async def clear_agent(self, _agent: str) -> int:
+        """Clear all cached responses for an agent.
+
+        Only keys carrying this agent's tag are matched, so entries belonging
+        to other agents are left alone.
+
+        Note: This uses SCAN which may be slow on large datasets.
+
+        Args:
+            _agent: Agent name to clear cache for.
+
+        Returns:
+            Number of keys deleted. If Redis fails part-way through, the count
+            of keys deleted before the failure is returned.
+        """
+        pattern = f"{self.PREFIX}:{self._agent_tag(_agent)}:*"
+        deleted = 0
+
+        try:
+            async for key in self.redis.scan_iter(match=pattern):
+                deleted += await self.redis.delete(key)
+        except Exception as e:
+            logger.warning("Cache clear error: %s", e)
+        return deleted
+
+    def get_stats(self) -> dict[str, Any]:
+        """Get cache statistics.
+
+        The counters are kept on this instance and cover only get() calls made
+        through it.
+
+        Returns:
+            Dict with hits, misses, total and hit rate (0.0 when there have
+            been no lookups).
+        """
+        total = self._hits + self._misses
+        hit_rate = self._hits / total if total > 0 else 0.0
+
+        return {
+            "hits": self._hits,
+            "misses": self._misses,
+            "total": total,
+            "hit_rate": hit_rate,
+        }
+
+
+def compute_policy_hash(policy_dict: dict[str, Any]) -> str:
+    """Return a short fingerprint of a policy mapping.
+
+    The mapping is dumped to JSON with sorted keys, so two dicts holding the
+    same items hash identically regardless of insertion order.
+
+    Args:
+        policy_dict: Policy configuration; must be JSON-serializable.
+
+    Returns:
+        The first 16 hex characters of the SHA-256 digest of that JSON.
+    """
+    canonical_json = json.dumps(policy_dict, sort_keys=True)
+    digest = hashlib.sha256(canonical_json.encode())
+    return digest.hexdigest()[:16]
@@ -1,5 +1,6 @@
 """Arbiter data models."""

+from arbiter.models.cost import AgentCost, CostEstimate, ReviewCost
 from arbiter.models.enums import AgentName, Severity, Verdict
 from arbiter.models.finding import Finding
 from arbiter.models.policy import AgentConfig, Policy
@@ -7,9 +8,12 @@ from arbiter.models.review import ReviewResult

 __all__ = [
    "AgentConfig",
+    "AgentCost",
    "AgentName",
+    "CostEstimate",
    "Finding",
    "Policy",
+    "ReviewCost",
    "ReviewResult",
    "Severity",
    "Verdict",
@@ -0,0 +1,152 @@
+"""Cost tracking models for Arbiter."""
+
+from pydantic import BaseModel, Field
+
+from arbiter.models.enums import AgentName
+
+
+class AgentCost(BaseModel):
+    """Token usage and estimated spend attributed to one agent.
+
+    ``total_tokens`` is a stored field, not a derived one: whoever builds the
+    model is responsible for keeping it equal to ``tokens_in + tokens_out``.
+    """
+
+    agent: AgentName = Field(description="Agent name")
+    tokens_in: int = Field(default=0, ge=0, description="Input tokens used")
+    tokens_out: int = Field(default=0, ge=0, description="Output tokens used")
+    total_tokens: int = Field(default=0, ge=0, description="Total tokens used")
+    cost_usd: float = Field(default=0.0, ge=0.0, description="Estimated cost in USD")
+
+
+class ReviewCost(BaseModel):
+    """Complete cost tracking for a review.
+
+    Per-agent entries and the deliberation figures are the source data; the
+    ``total_*`` fields are recomputed from them after every ``add_*`` call.
+    ``cache_hits`` / ``cache_misses`` are plain counters that no method on
+    this class updates.
+    """
+
+    # Per-agent costs
+    agent_costs: list[AgentCost] = Field(
+        default_factory=list, description="Cost breakdown by agent"
+    )
+
+    # Deliberation costs (synthesis)
+    deliberation_tokens_in: int = Field(ge=0, default=0, description="Deliberation input tokens")
+    deliberation_tokens_out: int = Field(ge=0, default=0, description="Deliberation output tokens")
+    deliberation_cost_usd: float = Field(ge=0.0, default=0.0, description="Deliberation cost")
+
+    # Totals
+    total_tokens_in: int = Field(ge=0, default=0, description="Total input tokens")
+    total_tokens_out: int = Field(ge=0, default=0, description="Total output tokens")
+    total_tokens: int = Field(ge=0, default=0, description="Total tokens")
+    total_cost_usd: float = Field(ge=0.0, default=0.0, description="Total cost in USD")
+
+    # Cache stats
+    cache_hits: int = Field(ge=0, default=0, description="Number of cache hits")
+    cache_misses: int = Field(ge=0, default=0, description="Number of cache misses")
+
+    def add_agent_cost(
+        self,
+        agent: AgentName,
+        tokens_in: int,
+        tokens_out: int,
+        cost_usd: float,
+    ) -> None:
+        """Add cost for an agent.
+
+        Each call appends a new entry; calling it again for the same agent
+        records an additional entry rather than replacing the earlier one.
+
+        Args:
+            agent: Agent name.
+            tokens_in: Input tokens used.
+            tokens_out: Output tokens used.
+            cost_usd: Cost in USD.
+        """
+        self.agent_costs.append(
+            AgentCost(
+                agent=agent,
+                tokens_in=tokens_in,
+                tokens_out=tokens_out,
+                total_tokens=tokens_in + tokens_out,
+                cost_usd=cost_usd,
+            )
+        )
+        self._update_totals()
+
+    def add_deliberation_cost(
+        self,
+        tokens_in: int,
+        tokens_out: int,
+        cost_usd: float,
+    ) -> None:
+        """Accumulate deliberation (synthesis) usage and refresh the totals.
+
+        Args:
+            tokens_in: Input tokens used.
+            tokens_out: Output tokens used.
+            cost_usd: Cost in USD.
+
+        Raises:
+            ValueError: If any argument is negative.
+        """
+        # The ge=0 constraints on the fields are not re-checked by a plain
+        # `+=` (this model does not turn on assignment validation), so reject
+        # negatives here to keep the running figures non-negative.
+        if tokens_in < 0 or tokens_out < 0 or cost_usd < 0:
+            raise ValueError("deliberation tokens and cost must be non-negative")
+
+        self.deliberation_tokens_in += tokens_in
+        self.deliberation_tokens_out += tokens_out
+        self.deliberation_cost_usd += cost_usd
+        self._update_totals()
+
+    def _update_totals(self) -> None:
+        """Recalculate totals from components."""
+        agent_tokens_in = sum(c.tokens_in for c in self.agent_costs)
+        agent_tokens_out = sum(c.tokens_out for c in self.agent_costs)
+        agent_cost = sum(c.cost_usd for c in self.agent_costs)
+
+        self.total_tokens_in = agent_tokens_in + self.deliberation_tokens_in
+        self.total_tokens_out = agent_tokens_out + self.deliberation_tokens_out
+        self.total_tokens = self.total_tokens_in + self.total_tokens_out
+        self.total_cost_usd = agent_cost + self.deliberation_cost_usd
+
+    def to_agent_dict(self) -> dict[str, int]:
+        """Return total tokens per agent, keyed by the agent's enum value.
+
+        Entries for the same agent are summed, so the values add up to the
+        agent share of ``total_tokens`` even when an agent was recorded twice.
+        """
+        tokens_by_agent: dict[str, int] = {}
+        for entry in self.agent_costs:
+            name = entry.agent.value
+            tokens_by_agent[name] = tokens_by_agent.get(name, 0) + entry.total_tokens
+        return tokens_by_agent
+
+    def to_cost_dict(self) -> dict[str, float]:
+        """Return cost in USD per agent, keyed by the agent's enum value.
+
+        Entries for the same agent are summed.
+        """
+        cost_by_agent: dict[str, float] = {}
+        for entry in self.agent_costs:
+            name = entry.agent.value
+            cost_by_agent[name] = cost_by_agent.get(name, 0.0) + entry.cost_usd
+        return cost_by_agent
+
+    def is_within_budget(self, max_tokens: int, max_cost_usd: float) -> bool:
+        """Return True when neither total tokens nor total cost exceeds its limit.
+
+        Both limits are inclusive.
+        """
+        return self.total_tokens <= max_tokens and self.total_cost_usd <= max_cost_usd
+
+
+class CostEstimate(BaseModel):
+    """Pre-review cost estimate."""
+
+    estimated_tokens: int = Field(ge=0, description="Estimated tokens needed")
+    estimated_cost_usd: float = Field(ge=0.0, description="Estimated cost in USD")
+    agents_enabled: list[AgentName] = Field(description="Agents that will run")
+    model: str = Field(description="Model to be used")
+    within_budget: bool = Field(description="Whether estimate is within budget")
+
+    @classmethod
+    def estimate(
+        cls,
+        diff_size: int,
+        agents: list[AgentName],
+        model: str,
+        max_tokens: int = 50000,
+        max_cost_usd: float = 0.50,
+    ) -> "CostEstimate":
+        """Estimate cost for a review.
+
+        This is a rough estimate based on diff size and model pricing. The
+        model name is matched case-insensitively.
+
+        Args:
+            diff_size: Size of diff in characters.
+            agents: Agents that will run.
+            model: Model to be used.
+            max_tokens: Maximum allowed tokens.
+            max_cost_usd: Maximum allowed cost.
+
+        Returns:
+            Cost estimate. ``estimated_cost_usd`` is rounded to 4 decimals;
+            ``within_budget`` is decided on the unrounded figure.
+        """
+        # Rough token estimate: ~4 chars per token for input
+        # Each agent typically uses 3x input for output
+        tokens_per_agent = (diff_size // 4) * 4  # input + 3x output
+
+        # Deliberation uses ~20% of agent tokens
+        deliberation_tokens = tokens_per_agent // 5
+
+        total_tokens = (tokens_per_agent * len(agents)) + deliberation_tokens
+
+        # Rough cost estimate based on model
+        # GPT-4o: $5/1M input, $15/1M output (~$10/1M average)
+        # GPT-4o-mini: $0.15/1M input, $0.60/1M output (~$0.40/1M average)
+        # Compare in lower case so a name such as "GPT-4o" is not silently
+        # priced at the mini rate.
+        model_name = model.lower()
+        is_full_gpt4o = "gpt-4o" in model_name and "mini" not in model_name
+        # NOTE(review): every model that is not recognised as full GPT-4o falls
+        # back to the mini rate, so estimates for other models may be far too
+        # low -- confirm this fallback is intended.
+        cost_per_million = 10.0 if is_full_gpt4o else 0.4
+        estimated_cost = (total_tokens / 1_000_000) * cost_per_million
+
+        return cls(
+            estimated_tokens=total_tokens,
+            estimated_cost_usd=round(estimated_cost, 4),
+            agents_enabled=agents,
+            model=model,
+            within_budget=total_tokens <= max_tokens and estimated_cost <= max_cost_usd,
+        )