feat(agents): implement agent framework and CLI

2025-03-08 15:52:29 +00:00
parent 72268ff440
commit f22ca1d5bd
30 changed files with 3466 additions and 0 deletions
--- a/tests/test_agents.py
+++ b/tests/test_agents.py
@@ -0,0 +1,305 @@
+"""Tests for review agents."""
+
+import pytest
+
+from arbiter.agents import ComplexityAgent, ReviewContext, SecurityAgent, StyleAgent
+from arbiter.llm.prompts import PromptRegistry
+from arbiter.models import AgentConfig, AgentName, Policy, Severity
+from tests.conftest import MockLLMClient
+
+
+class TestSecurityAgent:
+    """Exercise ``SecurityAgent.review`` against canned ``MockLLMClient`` replies."""
+
+    @pytest.mark.asyncio
+    async def test_review_returns_result(self, prompt_registry: PromptRegistry) -> None:
+        """An empty JSON array yields a findings-free result carrying usage metadata."""
+        llm = MockLLMClient(responses=["[]"])
+        reviewer = SecurityAgent(llm, prompt_registry)
+
+        outcome = await reviewer.review(ReviewContext(diff="+ some code", policy=Policy()))
+
+        assert outcome.agent_name == AgentName.SECURITY
+        assert outcome.findings == []
+        assert outcome.duration_ms >= 0
+        assert outcome.tokens_used == 150  # 100 in + 50 out from mock
+        # Compared against a float literal, so use a tolerance rather than exact equality.
+        assert outcome.cost_usd == pytest.approx(0.001)
+
+    @pytest.mark.asyncio
+    async def test_parses_json_findings(self, prompt_registry: PromptRegistry) -> None:
+        """A finding inside a fenced ``json`` code block is parsed into the result."""
+        # Mocked reply: a one-element JSON array wrapped in a Markdown code fence.
+        fenced_reply = """```json
+[
+  {
+    "file": "src/auth.py",
+    "line_start": 10,
+    "line_end": 15,
+    "severity": "high",
+    "confidence": 0.9,
+    "title": "SQL Injection",
+    "description": "User input concatenated",
+    "reasoning": "Allows SQL injection",
+    "suggestion": "Use parameterized queries",
+    "references": ["https://owasp.org"]
+  }
+]
+```"""
+        llm = MockLLMClient(responses=[fenced_reply])
+        reviewer = SecurityAgent(llm, prompt_registry)
+        ctx = ReviewContext(diff="+ query = ...", policy=Policy())
+
+        outcome = await reviewer.review(ctx)
+
+        # Spot-check fields against the values in the mocked reply.
+        assert len(outcome.findings) == 1
+        parsed = outcome.findings[0]
+        assert parsed.file == "src/auth.py"
+        assert parsed.severity == Severity.HIGH
+        assert parsed.confidence == 0.9
+        assert parsed.title == "SQL Injection"
+
+    @pytest.mark.asyncio
+    async def test_uses_configured_model(self, prompt_registry: PromptRegistry) -> None:
+        """Send the request with the model named in the agent's policy entry.
+
+        The model is read back from the first call recorded by the mock client.
+        """
+        llm = MockLLMClient(responses=["[]"])
+        reviewer = SecurityAgent(llm, prompt_registry)
+        # Only the security entry sets a model explicitly.
+        overrides = {
+            AgentName.SECURITY: AgentConfig(model="gpt-4o-mini"),
+            AgentName.STYLE: AgentConfig(),
+            AgentName.COMPLEXITY: AgentConfig(),
+        }
+        ctx = ReviewContext(diff="+ code", policy=Policy(agents=overrides))
+
+        await reviewer.review(ctx)
+
+        assert llm.calls[0]["model"] == "gpt-4o-mini"
+
+    @pytest.mark.asyncio
+    async def test_filters_by_severity(self, prompt_registry: PromptRegistry) -> None:
+        """Keep only findings that clear the agent's configured severity threshold.
+
+        The mocked reply holds one high, one low and one info finding, while the
+        security agent's threshold is set to medium.
+        """
+        reply = """[
+  {"file": "a.py", "line_start": 1, "line_end": 1, "severity": "high", "confidence": 0.9, "title": "High", "description": "", "reasoning": ""},
+  {"file": "b.py", "line_start": 1, "line_end": 1, "severity": "low", "confidence": 0.9, "title": "Low", "description": "", "reasoning": ""},
+  {"file": "c.py", "line_start": 1, "line_end": 1, "severity": "info", "confidence": 0.9, "title": "Info", "description": "", "reasoning": ""}
+]"""
+        llm = MockLLMClient(responses=[reply])
+        reviewer = SecurityAgent(llm, prompt_registry)
+        overrides = {
+            AgentName.SECURITY: AgentConfig(severity_threshold=Severity.MEDIUM),
+            AgentName.STYLE: AgentConfig(),
+            AgentName.COMPLEXITY: AgentConfig(),
+        }
+        ctx = ReviewContext(diff="+ code", policy=Policy(agents=overrides))
+
+        outcome = await reviewer.review(ctx)
+
+        # Only high severity should pass (medium threshold filters low and info)
+        assert len(outcome.findings) == 1
+        assert outcome.findings[0].severity == Severity.HIGH
+
+
+class TestStyleAgent:
+    """Exercise ``StyleAgent.review`` against canned ``MockLLMClient`` replies."""
+
+    @pytest.mark.asyncio
+    async def test_review_returns_result(self, prompt_registry: PromptRegistry) -> None:
+        """An empty JSON array yields a result tagged STYLE with no findings."""
+        llm = MockLLMClient(responses=["[]"])
+        reviewer = StyleAgent(llm, prompt_registry)
+        ctx = ReviewContext(diff="+ some code", policy=Policy())
+
+        outcome = await reviewer.review(ctx)
+
+        assert outcome.agent_name == AgentName.STYLE
+        assert outcome.findings == []
+
+    @pytest.mark.asyncio
+    async def test_uses_default_model(self, prompt_registry: PromptRegistry) -> None:
+        """With a default-constructed policy the request names ``gpt-4o-mini``."""
+        llm = MockLLMClient(responses=["[]"])
+        reviewer = StyleAgent(llm, prompt_registry)
+        # Policy() is built with no arguments, so nothing is overridden explicitly here.
+        ctx = ReviewContext(diff="+ code", policy=Policy())
+
+        await reviewer.review(ctx)
+
+        # The model is read back from the first call recorded by the mock client.
+        assert llm.calls[0]["model"] == "gpt-4o-mini"
+
+
+class TestComplexityAgent:
+    """Exercise ``ComplexityAgent.review`` against canned ``MockLLMClient`` replies."""
+
+    @pytest.mark.asyncio
+    async def test_review_returns_result(self, prompt_registry: PromptRegistry) -> None:
+        """An empty JSON array yields a result tagged COMPLEXITY with no findings."""
+        llm = MockLLMClient(responses=["[]"])
+        reviewer = ComplexityAgent(llm, prompt_registry)
+        ctx = ReviewContext(diff="+ some code", policy=Policy())
+
+        outcome = await reviewer.review(ctx)
+
+        assert outcome.agent_name == AgentName.COMPLEXITY
+        assert outcome.findings == []
+
+    @pytest.mark.asyncio
+    async def test_parses_complexity_findings(self, prompt_registry: PromptRegistry) -> None:
+        """A bare JSON array with one medium-severity finding is parsed into the result."""
+        reply = """[
+  {
+    "file": "processor.py",
+    "line_start": 1,
+    "line_end": 50,
+    "severity": "medium",
+    "confidence": 0.8,
+    "title": "High cyclomatic complexity",
+    "description": "Function has 15 branches",
+    "reasoning": "Makes testing and maintenance difficult"
+  }
+]"""
+        llm = MockLLMClient(responses=[reply])
+        reviewer = ComplexityAgent(llm, prompt_registry)
+        ctx = ReviewContext(diff="+ complex code", policy=Policy())
+
+        outcome = await reviewer.review(ctx)
+
+        assert len(outcome.findings) == 1
+        # Inspect the single parsed finding.
+        reported = outcome.findings[0]
+        assert reported.severity == Severity.MEDIUM
+        assert "complexity" in reported.title.lower()
+
+
+class TestAgentResponseParsing:
+    """Check how ``SecurityAgent.review`` copes with unusual LLM replies and policies."""
+
+    @pytest.mark.asyncio
+    async def test_handles_empty_response(self, prompt_registry: PromptRegistry) -> None:
+        """An empty reply string produces no findings."""
+        llm = MockLLMClient(responses=[""])
+        reviewer = SecurityAgent(llm, prompt_registry)
+        ctx = ReviewContext(diff="+ code", policy=Policy())
+
+        outcome = await reviewer.review(ctx)
+
+        assert outcome.findings == []
+
+    @pytest.mark.asyncio
+    async def test_handles_invalid_json(self, prompt_registry: PromptRegistry) -> None:
+        """A reply that is not JSON produces no findings.
+
+        The review call is expected to return normally rather than raise.
+        """
+        llm = MockLLMClient(responses=["not valid json"])
+        reviewer = SecurityAgent(llm, prompt_registry)
+
+        outcome = await reviewer.review(ReviewContext(diff="+ code", policy=Policy()))
+
+        assert outcome.findings == []
+
+    @pytest.mark.asyncio
+    async def test_handles_json_without_code_block(self, prompt_registry: PromptRegistry) -> None:
+        """A JSON array sent without a surrounding code fence is still parsed."""
+        # Bare one-element array: no Markdown fence around the JSON.
+        reply = '[{"file": "a.py", "line_start": 1, "line_end": 1, "severity": "info", "confidence": 0.5, "title": "Test", "description": "", "reasoning": ""}]'
+        llm = MockLLMClient(responses=[reply])
+        reviewer = SecurityAgent(llm, prompt_registry)
+        ctx = ReviewContext(diff="+ code", policy=Policy())
+
+        outcome = await reviewer.review(ctx)
+
+        # The single item in the reply should come back as one finding.
+        assert len(outcome.findings) == 1
+
+    @pytest.mark.asyncio
+    async def test_handles_malformed_finding(self, prompt_registry: PromptRegistry) -> None:
+        """A malformed item is skipped while a well-formed sibling is kept."""
+        # The first item carries severity "invalid_severity" and has no line_end;
+        # the second item has a line_end and uses the severity "info".
+        reply = """[
+  {"file": "a.py", "line_start": 1, "severity": "invalid_severity", "confidence": 0.5, "title": "Bad", "description": "", "reasoning": ""},
+  {"file": "b.py", "line_start": 1, "line_end": 1, "severity": "info", "confidence": 0.5, "title": "Valid", "description": "", "reasoning": ""}
+]"""
+        llm = MockLLMClient(responses=[reply])
+        reviewer = SecurityAgent(llm, prompt_registry)
+        ctx = ReviewContext(diff="+ code", policy=Policy())
+
+        outcome = await reviewer.review(ctx)
+
+        # Only the valid finding should be included (first has invalid severity)
+        assert len(outcome.findings) == 1
+        assert outcome.findings[0].title == "Valid"
+
+    @pytest.mark.asyncio
+    async def test_includes_prompt_additions(self, prompt_registry: PromptRegistry) -> None:
+        """Policy-supplied ``prompt_additions`` text reaches the LLM client.
+
+        The text is looked up in the first message of the first recorded call.
+        """
+        llm = MockLLMClient(responses=["[]"])
+        reviewer = SecurityAgent(llm, prompt_registry)
+        # Only the security entry carries extra prompt text.
+        overrides = {
+            AgentName.SECURITY: AgentConfig(prompt_additions="Focus on authentication"),
+            AgentName.STYLE: AgentConfig(),
+            AgentName.COMPLEXITY: AgentConfig(),
+        }
+        ctx = ReviewContext(diff="+ code", policy=Policy(agents=overrides))
+
+        await reviewer.review(ctx)
+
+        first_message = llm.calls[0]["messages"][0]["content"]
+        assert "Focus on authentication" in first_message
+
+    @pytest.mark.asyncio
+    async def test_handles_non_list_json(self, prompt_registry: PromptRegistry) -> None:
+        """Valid JSON that is not a list produces no findings.
+
+        Here the reply's top-level value is a JSON object.
+        """
+        llm = MockLLMClient(responses=['{"not": "a list"}'])
+        reviewer = SecurityAgent(llm, prompt_registry)
+
+        outcome = await reviewer.review(ReviewContext(diff="+ code", policy=Policy()))
+
+        assert outcome.findings == []
+
+    @pytest.mark.asyncio
+    async def test_handles_non_dict_items(self, prompt_registry: PromptRegistry) -> None:
+        """A JSON list whose items are not objects produces no findings."""
+        # The reply is a list holding a string, a number and null; none of them is
+        # a JSON object.
+        llm = MockLLMClient(responses=['["string", 123, null]'])
+        reviewer = SecurityAgent(llm, prompt_registry)
+        ctx = ReviewContext(diff="+ code", policy=Policy())
+
+        outcome = await reviewer.review(ctx)
+
+        assert outcome.findings == []
+
+    @pytest.mark.asyncio
+    async def test_agent_without_config_uses_defaults(
+        self, prompt_registry: PromptRegistry
+    ) -> None:
+        """A policy with an empty ``agents`` mapping still yields a request to ``gpt-4o``."""
+        llm = MockLLMClient(responses=["[]"])
+        reviewer = SecurityAgent(llm, prompt_registry)
+        # Create policy with empty agents dict
+        empty_policy = Policy(agents={})
+        ctx = ReviewContext(diff="+ code", policy=empty_policy)
+
+        outcome = await reviewer.review(ctx)
+
+        # Should use default model (gpt-4o for security)
+        assert llm.calls[0]["model"] == "gpt-4o"
+        assert outcome.findings == []