add static analysis + deliberation pipeline

2025-03-09 11:14:29 +00:00
parent f22ca1d5bd
commit 2bb7e03871
13 changed files with 4037 additions and 0 deletions
@@ -0,0 +1,42 @@
+diff --git a/src/config.py b/src/config.py
+index 1234567..abcdefg 100644
+--- a/src/config.py
+++ b/src/config.py
+@@ -1,5 +1,35 @@
+ """Configuration module."""
+
+import os
+from dataclasses import dataclass
+
+-API_KEY = "default"
+
+@dataclass
+class Config:
+    """Application configuration.
+
+    This demonstrates contradictory recommendations:
+    - Security wants environment variables for secrets
+    - Style wants simple, readable configuration
+    - Complexity wants to avoid the extra abstraction
+    """
+
+    api_key: str
+    debug: bool
+    max_connections: int
+
+    @classmethod
+    def from_env(cls) -> "Config":
+        """Build a Config from the API_KEY, DEBUG and MAX_CONNECTIONS environment variables."""
+        return cls(
+            api_key=os.environ.get("API_KEY", ""),  # empty string when unset
+            debug=os.environ.get("DEBUG", "false").lower() == "true",  # only "true" (any case) enables it
+            max_connections=int(os.environ.get("MAX_CONNECTIONS", "10")),  # int() raises ValueError if non-numeric
+        )
+
+
+# Global config instance, built from literals rather than Config.from_env() - security says use env vars, style says this is fine
+config = Config(
+    api_key="sk-prod-abc123",  # Security: hardcoded secret! Style: it's readable
+    debug=True,
+    max_connections=100,
+)
@@ -0,0 +1,37 @@
+diff --git a/src/handler.py b/src/handler.py
+index 1234567..abcdefg 100644
+--- a/src/handler.py
+++ b/src/handler.py
+@@ -1,8 +1,30 @@
+ """Request handler module."""
+
+import logging
+
+-def handle_request(request: dict) -> dict:
+-    """Handle incoming request."""
+-    return {"status": "ok"}
+logger = logging.getLogger(__name__)
+
+
+def handle_request(request: dict) -> dict:
+    """Handle incoming request, logging the request, the user action and the result.
+
+    This function has overlapping concerns that both security and style
+    agents might flag - sensitive data in logs, and the absence of any error handling.
+    """
+    # Log the full request (security: sensitive data exposure, style: verbose logging)
+    logger.debug(f"Received request: {request}")
+
+    user_id = request.get("user_id")  # None when the key is missing
+    action = request.get("action")  # None when the key is missing
+
+    # Log user action with password (both agents will flag this)
+    logger.info(f"User {user_id} performing {action}, auth: {request.get('password')}")
+
+    # Process the request: the status is always "ok" and the user id is echoed back
+    result = {"status": "ok", "user": user_id}
+
+    # Log the result
+    logger.debug(f"Returning result: {result}")
+
+    return result
@@ -0,0 +1,57 @@
+diff --git a/src/validator.py b/src/validator.py
+index 1234567..abcdefg 100644
+--- a/src/validator.py
+++ b/src/validator.py
+@@ -1,10 +1,45 @@
+ """Input validation module."""
+
+ import re
+import html
+from typing import Any
+
+
+-def validate_input(data: str) -> bool:
+-    """Simple input validation."""
+-    return len(data) > 0
+def validate_user_input(
+    data: str,
+    context: dict[str, Any],  # NOTE(review): not read anywhere in this function
+    options: dict[str, Any] | None = None,  # recognised keys: "max_length", "allow_paths"
+) -> dict[str, Any]:
+    """Validate and HTML-escape ``data``, returning a dict with "valid", "errors" and "sanitized".
+
+    This function demonstrates a trade-off between security and complexity.
+    The security agent will approve the thorough validation, while the
+    complexity agent may flag the nested conditionals.
+    """
+    options = options or {}
+    result: dict[str, Any] = {"valid": False, "errors": [], "sanitized": None}
+
+    # Length validation - every failed check returns early with a single error message
+    if len(data) < 1:
+        result["errors"].append("Input cannot be empty")
+        return result
+
+    if len(data) > options.get("max_length", 10000):
+        result["errors"].append("Input exceeds maximum length")
+        return result
+
+    # XSS prevention - HTML-escape the input (html.escape is the only layer applied here)
+    sanitized = html.escape(data)
+
+    # SQL injection pattern detection (runs on the raw input, not the escaped copy)
+    sql_patterns = [r"'\s*OR\s*'", r";\s*DROP\s+TABLE", r"UNION\s+SELECT"]
+    for pattern in sql_patterns:
+        if re.search(pattern, data, re.IGNORECASE):
+            result["errors"].append(f"Potentially malicious pattern detected")  # NOTE(review): f-prefix has no placeholders
+            return result
+
+    # Path traversal check - rejected unless options["allow_paths"] is truthy
+    if ".." in data or data.startswith("/"):
+        if not options.get("allow_paths", False):
+            result["errors"].append("Path characters not allowed")
+            return result
+
+    # All checks passed: report success and expose the escaped text
+    result["valid"] = True
+    result["sanitized"] = sanitized
+    return result
@@ -0,0 +1,743 @@
+"""Tests for deliberation module."""
+
+import pytest
+
+from arbiter.deliberation.conflicts import Conflict, ConflictDetector, ConflictNature
+from arbiter.deliberation.coordinator import Coordinator, StepType
+from arbiter.deliberation.merger import FindingGroup, FindingMerger
+from arbiter.deliberation.synthesis import ConflictSynthesizer
+from arbiter.models import AgentName, Finding, ReviewResult, Severity, Verdict
+
+from .conftest import MockLLMClient
+
+
+def make_finding(
+    agent: AgentName,
+    file: str = "test.py",
+    line_start: int = 10,
+    line_end: int = 15,
+    severity: Severity = Severity.MEDIUM,
+    confidence: float = 0.8,
+    title: str = "Test finding",
+    suggestion: str | None = None,
+) -> Finding:
+    """Build a ``Finding`` populated with test-friendly defaults.
+
+    The id is derived from the agent value, file and starting line, so two
+    findings that share all three also share an id. The description and
+    reasoning texts are generated from ``title``.
+    """
+    finding_id = f"{agent.value}-{file}-{line_start}"
+    description = f"Description for {title}"
+    reasoning = f"Reasoning for {title}"
+
+    return Finding(
+        id=finding_id,
+        agent=agent,
+        file=file,
+        line_start=line_start,
+        line_end=line_end,
+        severity=severity,
+        confidence=confidence,
+        title=title,
+        description=description,
+        reasoning=reasoning,
+        suggestion=suggestion,
+        prompt_version="test-v1.0",
+    )
+
+
+class TestFindingMerger:
+    """Tests for ``FindingMerger.merge`` and the ``FindingGroup`` accessors."""
+
+    def test_merge_empty(self) -> None:
+        """Merging nothing yields empty collections and zero duplicates."""
+        merger = FindingMerger()
+        outcome = merger.merge([], None)
+
+        assert outcome.unique_findings == []
+        assert outcome.groups == []
+        assert outcome.duplicates_removed == 0
+
+    def test_merge_single_finding(self) -> None:
+        """A lone finding is kept and is the primary finding of its group."""
+        merger = FindingMerger()
+        only = make_finding(AgentName.SECURITY)
+
+        outcome = merger.merge([only], None)
+
+        assert len(outcome.unique_findings) == 1
+        assert len(outcome.groups) == 1
+        assert outcome.groups[0].primary_finding == only
+
+    def test_merge_deduplicates_similar(self) -> None:
+        """Two findings with near-identical titles collapse into one."""
+        merger = FindingMerger()
+        similar = [
+            make_finding(AgentName.SECURITY, title="SQL Injection"),
+            make_finding(AgentName.STYLE, title="SQL Injection vulnerability"),
+        ]
+
+        outcome = merger.merge(similar, None)
+
+        assert outcome.duplicates_removed == 1
+        assert len(outcome.unique_findings) == 1
+
+    def test_merge_groups_by_proximity(self) -> None:
+        """Findings a few lines apart share a group; a distant one stands alone."""
+        merger = FindingMerger(proximity_threshold=5)
+
+        near_a = make_finding(AgentName.SECURITY, line_start=10, line_end=12)
+        near_b = make_finding(AgentName.STYLE, line_start=14, line_end=16)
+        distant = make_finding(AgentName.COMPLEXITY, line_start=50, line_end=55)
+
+        groups = merger.merge([near_a, near_b, distant], None).groups
+
+        # near_a + near_b land in the first group, distant in the second
+        assert len(groups) == 2
+        assert len(groups[0].findings) == 2
+        assert len(groups[1].findings) == 1
+
+    def test_merge_includes_static_findings(self) -> None:
+        """Static-analysis findings are merged alongside agent findings."""
+        merger = FindingMerger()
+        from_agent = make_finding(AgentName.SECURITY)
+        from_static = make_finding(
+            AgentName.STYLE,
+            title="[ruff] E501",
+            line_start=100,
+        )
+
+        outcome = merger.merge([from_agent], [from_static])
+
+        assert len(outcome.unique_findings) == 2
+        assert len(outcome.groups) == 2
+
+    def test_finding_group_primary(self) -> None:
+        """The primary finding of a group is the one with the highest severity."""
+        members = [
+            make_finding(AgentName.STYLE, severity=Severity.LOW),
+            make_finding(AgentName.SECURITY, severity=Severity.HIGH),
+            make_finding(AgentName.COMPLEXITY, severity=Severity.MEDIUM),
+        ]
+        group = FindingGroup(file="test.py", line_start=10, line_end=20, findings=members)
+
+        top = group.primary_finding
+
+        assert top is not None
+        assert top.severity == Severity.HIGH
+
+    def test_finding_group_agents(self) -> None:
+        """``agents`` lists every agent that contributed to the group."""
+        members = [
+            make_finding(AgentName.SECURITY),
+            make_finding(AgentName.STYLE),
+        ]
+        group = FindingGroup(file="test.py", line_start=10, line_end=20, findings=members)
+
+        contributing = group.agents
+
+        assert len(contributing) == 2
+        assert AgentName.SECURITY in contributing
+        assert AgentName.STYLE in contributing
+
+
+class TestConflictDetector:
+    """Tests for ``ConflictDetector.detect_conflicts`` and ``resolve_by_severity``."""
+
+    def test_no_conflicts_different_files(self) -> None:
+        """Two findings in different files produce no conflict."""
+        detector = ConflictDetector()
+        in_a = make_finding(AgentName.SECURITY, file="a.py")
+        in_b = make_finding(AgentName.STYLE, file="b.py")
+
+        found = detector.detect_conflicts([in_a, in_b])
+
+        assert len(found) == 0
+
+    def test_no_conflicts_same_agent(self) -> None:
+        """Two nearby findings from the same agent produce no conflict."""
+        detector = ConflictDetector()
+        first = make_finding(AgentName.SECURITY, line_start=10)
+        second = make_finding(AgentName.SECURITY, line_start=12)
+
+        found = detector.detect_conflicts([first, second])
+
+        assert len(found) == 0
+
+    def test_detects_trade_off(self) -> None:
+        """Security vs. complexity at the same location is reported as a trade-off."""
+        detector = ConflictDetector()
+        # Distinct titles keep the overlapping-title check from firing first.
+        security = make_finding(
+            AgentName.SECURITY, severity=Severity.HIGH, title="SQL injection vulnerability"
+        )
+        complexity = make_finding(
+            AgentName.COMPLEXITY, severity=Severity.MEDIUM, title="Function too complex"
+        )
+
+        found = detector.detect_conflicts([security, complexity])
+
+        assert len(found) == 1
+        conflict = found[0]
+        assert conflict.nature == ConflictNature.TRADE_OFF
+        assert "security" in conflict.description.lower()
+        assert "complexity" in conflict.description.lower()
+
+    def test_detects_contradictory(self) -> None:
+        """Opposing suggestions are flagged as contradictory or as a trade-off."""
+        detector = ConflictDetector()
+        add_it = make_finding(
+            AgentName.SECURITY,
+            suggestion="Add input validation here",
+        )
+        remove_it = make_finding(
+            AgentName.COMPLEXITY,
+            suggestion="Remove this validation code",
+        )
+
+        found = detector.detect_conflicts([add_it, remove_it])
+
+        assert len(found) == 1
+        # Security/complexity is a known trade-off pair, so either nature is accepted.
+        accepted = (ConflictNature.CONTRADICTORY, ConflictNature.TRADE_OFF)
+        assert found[0].nature in accepted
+
+    def test_detects_overlapping(self) -> None:
+        """Similar titles from security and style yield exactly one conflict."""
+        detector = ConflictDetector()
+        security = make_finding(
+            AgentName.SECURITY,
+            title="Hardcoded password in configuration",
+        )
+        style = make_finding(
+            AgentName.STYLE,
+            title="Hardcoded password should be in environment",
+        )
+
+        found = detector.detect_conflicts([security, style])
+
+        assert len(found) == 1
+        # Security/style is a trade-off pair AND the titles overlap. Which nature
+        # is reported depends on the order of the detector's checks, so both
+        # outcomes are accepted here.
+        accepted = (ConflictNature.TRADE_OFF, ConflictNature.OVERLAPPING)
+        assert found[0].nature in accepted
+
+    def test_resolve_by_severity(self) -> None:
+        """Severity-based resolution picks the more severe finding as the winner."""
+        detector = ConflictDetector()
+        high = make_finding(AgentName.SECURITY, severity=Severity.HIGH)
+        medium = make_finding(AgentName.COMPLEXITY, severity=Severity.MEDIUM)
+        pair = [high, medium]
+
+        found = detector.detect_conflicts(pair)
+        outcome = detector.resolve_by_severity(found[0], pair)
+
+        assert outcome.winning_finding_id == high.id
+        assert "severity" in outcome.resolution.lower()
+
+
+class TestConflictSynthesizer:
+    """Tests for ``ConflictSynthesizer.synthesize`` and ``should_synthesize``."""
+
+    @pytest.mark.asyncio
+    async def test_synthesize_returns_resolution(self) -> None:
+        """A well-formed JSON reply is parsed into the resolution fields."""
+        llm_reply = """{
+            "decision": "prefer_first",
+            "reasoning": "Security takes priority over complexity",
+            "merged_suggestion": null,
+            "confidence": 0.85
+        }"""
+        llm = MockLLMClient(responses=[llm_reply])
+        synthesizer = ConflictSynthesizer(llm)
+
+        security = make_finding(AgentName.SECURITY, severity=Severity.HIGH)
+        complexity = make_finding(AgentName.COMPLEXITY, severity=Severity.MEDIUM)
+        conflict = Conflict(
+            id="test-conflict",
+            finding_ids=[security.id, complexity.id],
+            nature=ConflictNature.TRADE_OFF,
+            description="Test conflict",
+            severity_weight=0.8,
+        )
+
+        outcome = await synthesizer.synthesize(conflict, [security, complexity])
+
+        assert outcome.decision == "prefer_first"
+        assert outcome.confidence == 0.85
+        assert "security" in outcome.reasoning.lower()
+
+    @pytest.mark.asyncio
+    async def test_synthesize_handles_invalid_json(self) -> None:
+        """An unparseable reply falls back to severity-based resolution."""
+        llm = MockLLMClient(responses=["not valid json"])
+        synthesizer = ConflictSynthesizer(llm)
+
+        security = make_finding(AgentName.SECURITY, severity=Severity.HIGH)
+        complexity = make_finding(AgentName.COMPLEXITY, severity=Severity.LOW)
+        conflict = Conflict(
+            id="test-conflict",
+            finding_ids=[security.id, complexity.id],
+            nature=ConflictNature.TRADE_OFF,
+            description="Test conflict",
+            severity_weight=0.8,
+        )
+
+        outcome = await synthesizer.synthesize(conflict, [security, complexity])
+
+        # The first finding is the more severe one, so the fallback prefers it.
+        assert outcome.decision == "prefer_first"
+        assert "fallback" in outcome.reasoning.lower()
+
+    def test_should_synthesize_contradictory(self) -> None:
+        """Contradictory conflicts are sent to synthesis."""
+        synthesizer = ConflictSynthesizer(MockLLMClient())
+        contradictory = Conflict(
+            id="test",
+            finding_ids=["a", "b"],
+            nature=ConflictNature.CONTRADICTORY,
+            description="Test",
+            severity_weight=0.5,
+        )
+
+        verdict = synthesizer.should_synthesize(contradictory)
+
+        assert verdict is True
+
+    def test_should_not_synthesize_overlapping(self) -> None:
+        """Overlapping conflicts are not sent to synthesis."""
+        synthesizer = ConflictSynthesizer(MockLLMClient())
+        overlapping = Conflict(
+            id="test",
+            finding_ids=["a", "b"],
+            nature=ConflictNature.OVERLAPPING,
+            description="Test",
+            severity_weight=0.5,
+        )
+
+        verdict = synthesizer.should_synthesize(overlapping)
+
+        assert verdict is False
+
+
+class TestCoordinator:
+    """End-to-end tests for ``Coordinator.deliberate``."""
+
+    @staticmethod
+    def _review(agent: AgentName, findings: list[Finding]) -> ReviewResult:
+        """Wrap ``findings`` in a ``ReviewResult`` carrying fixed dummy run metrics."""
+        return ReviewResult(
+            agent_name=agent,
+            findings=findings,
+            duration_ms=100,
+            tokens_used=1000,
+            cost_usd=0.01,
+        )
+
+    @pytest.mark.asyncio
+    async def test_deliberate_empty_results(self) -> None:
+        """With no agent results the verdict is APPROVE and steps are still logged."""
+        coordinator = Coordinator()
+
+        outcome = await coordinator.deliberate([], None)
+
+        assert outcome.verdict == Verdict.APPROVE
+        assert outcome.total_findings == 0
+        assert len(outcome.steps) > 0
+
+    @pytest.mark.asyncio
+    async def test_deliberate_merges_findings(self) -> None:
+        """Findings from two agents at distant lines end up in two groups."""
+        coordinator = Coordinator()
+        reviews = [
+            self._review(AgentName.SECURITY, [make_finding(AgentName.SECURITY)]),
+            self._review(AgentName.STYLE, [make_finding(AgentName.STYLE, line_start=50)]),
+        ]
+
+        outcome = await coordinator.deliberate(reviews)
+
+        assert outcome.total_findings == 2
+        assert len(outcome.merged.groups) == 2
+        assert any(step.step_type == StepType.MERGE for step in outcome.steps)
+
+    @pytest.mark.asyncio
+    async def test_deliberate_detects_conflicts(self) -> None:
+        """Different agents flagging the same location produce a conflict."""
+        coordinator = Coordinator()
+
+        # Same default location, different agents, different titles.
+        reviews = [
+            self._review(
+                AgentName.SECURITY,
+                [
+                    make_finding(
+                        AgentName.SECURITY, severity=Severity.HIGH, title="SQL injection risk"
+                    )
+                ],
+            ),
+            self._review(
+                AgentName.COMPLEXITY,
+                [
+                    make_finding(
+                        AgentName.COMPLEXITY,
+                        severity=Severity.MEDIUM,
+                        title="Overly complex function",
+                    )
+                ],
+            ),
+        ]
+
+        outcome = await coordinator.deliberate(reviews)
+
+        assert len(outcome.conflicts) > 0
+        assert any(step.step_type == StepType.CONFLICT_DETECTION for step in outcome.steps)
+
+    @pytest.mark.asyncio
+    async def test_verdict_critical_requests_changes(self) -> None:
+        """A single CRITICAL finding forces REQUEST_CHANGES."""
+        coordinator = Coordinator()
+        reviews = [
+            self._review(
+                AgentName.SECURITY,
+                [make_finding(AgentName.SECURITY, severity=Severity.CRITICAL)],
+            ),
+        ]
+
+        outcome = await coordinator.deliberate(reviews)
+
+        assert outcome.verdict == Verdict.REQUEST_CHANGES
+        assert outcome.critical_count == 1
+
+    @pytest.mark.asyncio
+    async def test_verdict_multiple_high_requests_changes(self) -> None:
+        """Three HIGH findings force REQUEST_CHANGES."""
+        coordinator = Coordinator()
+        reviews = [
+            self._review(
+                AgentName.SECURITY,
+                [
+                    make_finding(AgentName.SECURITY, severity=Severity.HIGH, line_start=10),
+                    make_finding(AgentName.SECURITY, severity=Severity.HIGH, line_start=20),
+                    make_finding(AgentName.SECURITY, severity=Severity.HIGH, line_start=30),
+                ],
+            ),
+        ]
+
+        outcome = await coordinator.deliberate(reviews)
+
+        assert outcome.verdict == Verdict.REQUEST_CHANGES
+        assert outcome.high_count == 3
+
+    @pytest.mark.asyncio
+    async def test_verdict_low_severity_approves(self) -> None:
+        """Only LOW and INFO findings still result in APPROVE."""
+        coordinator = Coordinator()
+        reviews = [
+            self._review(
+                AgentName.STYLE,
+                [
+                    make_finding(AgentName.STYLE, severity=Severity.LOW, line_start=10),
+                    make_finding(AgentName.STYLE, severity=Severity.INFO, line_start=20),
+                ],
+            ),
+        ]
+
+        outcome = await coordinator.deliberate(reviews)
+
+        assert outcome.verdict == Verdict.APPROVE
+
+    @pytest.mark.asyncio
+    async def test_deliberation_steps_logged(self) -> None:
+        """Merge, conflict-detection and verdict steps are all recorded."""
+        coordinator = Coordinator()
+        reviews = [
+            self._review(AgentName.SECURITY, [make_finding(AgentName.SECURITY)]),
+        ]
+
+        outcome = await coordinator.deliberate(reviews)
+
+        recorded = [step.step_type for step in outcome.steps]
+        assert StepType.MERGE in recorded
+        assert StepType.CONFLICT_DETECTION in recorded
+        assert StepType.VERDICT in recorded
+
+    @pytest.mark.asyncio
+    async def test_verdict_medium_count_comments(self) -> None:
+        """Five MEDIUM findings produce a COMMENT verdict that mentions 'medium'."""
+        coordinator = Coordinator()
+
+        # Five distinct findings at lines 10, 20, 30, 40 and 50.
+        medium_findings = [
+            make_finding(
+                AgentName.STYLE,
+                severity=Severity.MEDIUM,
+                line_start=(index + 1) * 10,
+                title=f"Issue {index}",
+            )
+            for index in range(5)
+        ]
+        reviews = [self._review(AgentName.STYLE, medium_findings)]
+
+        outcome = await coordinator.deliberate(reviews)
+
+        assert outcome.verdict == Verdict.COMMENT
+        assert "medium" in outcome.verdict_reasoning.lower()
+
+    @pytest.mark.asyncio
+    async def test_verdict_single_high_comments(self) -> None:
+        """A single HIGH finding produces COMMENT rather than REQUEST_CHANGES."""
+        coordinator = Coordinator()
+        reviews = [
+            self._review(
+                AgentName.SECURITY,
+                [
+                    make_finding(AgentName.SECURITY, severity=Severity.HIGH),
+                ],
+            ),
+        ]
+
+        outcome = await coordinator.deliberate(reviews)
+
+        assert outcome.verdict == Verdict.COMMENT
+        assert outcome.high_count == 1
+
+    @pytest.mark.asyncio
+    async def test_deliberate_with_synthesis(self) -> None:
+        """With an LLM client configured, a synthesis step is logged for the conflict."""
+        llm_reply = """{
+            "decision": "prefer_first",
+            "reasoning": "Security takes priority",
+            "merged_suggestion": null,
+            "confidence": 0.85
+        }"""
+        llm = MockLLMClient(responses=[llm_reply])
+        coordinator = Coordinator(llm_client=llm)
+
+        # Same default location, different agents, opposing suggestions.
+        reviews = [
+            self._review(
+                AgentName.SECURITY,
+                [
+                    make_finding(
+                        AgentName.SECURITY,
+                        severity=Severity.HIGH,
+                        title="Security vulnerability",
+                        suggestion="Add validation",
+                    )
+                ],
+            ),
+            self._review(
+                AgentName.COMPLEXITY,
+                [
+                    make_finding(
+                        AgentName.COMPLEXITY,
+                        severity=Severity.MEDIUM,
+                        title="Complex function",
+                        suggestion="Remove validation",
+                    )
+                ],
+            ),
+        ]
+
+        outcome = await coordinator.deliberate(reviews)
+
+        assert len(outcome.conflicts) > 0
+        # The synthesis step must appear in the deliberation log.
+        assert any(step.step_type == StepType.SYNTHESIS for step in outcome.steps)
+
+
+class TestConflictDetectorEdgeCases:
+    """Edge-case tests for ``ConflictDetector``."""
+
+    def test_no_conflicts_with_no_overlap(self) -> None:
+        """Findings in the same file but far apart produce no conflict."""
+        detector = ConflictDetector()
+        early = make_finding(AgentName.SECURITY, line_start=10, line_end=15)
+        late = make_finding(AgentName.STYLE, line_start=100, line_end=105)
+
+        found = detector.detect_conflicts([early, late])
+
+        assert len(found) == 0
+
+    def test_overlap_no_title_match(self) -> None:
+        """Security and style at the same location with unrelated titles is a trade-off."""
+        detector = ConflictDetector()
+        security = make_finding(
+            AgentName.SECURITY,
+            title="Unique security title",
+        )
+        style = make_finding(
+            AgentName.STYLE,
+            title="Completely different style concern",
+        )
+
+        found = detector.detect_conflicts([security, style])
+
+        assert len(found) == 1
+        # Security/style is a trade-off pair, so that is the reported nature.
+        assert found[0].nature == ConflictNature.TRADE_OFF
+
+    def test_resolve_empty_findings(self) -> None:
+        """Resolving against an empty finding list leaves no winner."""
+        detector = ConflictDetector()
+        dangling = Conflict(
+            id="test",
+            finding_ids=["nonexistent1", "nonexistent2"],
+            nature=ConflictNature.TRADE_OFF,
+            description="Test",
+            severity_weight=0.5,
+        )
+
+        outcome = detector.resolve_by_severity(dangling, [])
+
+        assert outcome.winning_finding_id is None
+
+
+class TestConflictSynthesizerEdgeCases:
+    """Edge-case tests for ``ConflictSynthesizer``: fallbacks, thresholds and parsing."""
+
+    @pytest.mark.asyncio
+    async def test_synthesize_missing_findings(self) -> None:
+        """When the referenced findings cannot be found, both are kept."""
+        llm = MockLLMClient()
+        synthesizer = ConflictSynthesizer(llm)
+        dangling = Conflict(
+            id="test",
+            finding_ids=["nonexistent1", "nonexistent2"],
+            nature=ConflictNature.CONTRADICTORY,
+            description="Test",
+            severity_weight=0.8,
+        )
+
+        outcome = await synthesizer.synthesize(dangling, [])
+
+        assert outcome.decision == "keep_both"
+        assert "Could not find" in outcome.reasoning
+
+    def test_synthesize_low_severity(self) -> None:
+        """A trade-off with weight 0.5 (below the 0.7 threshold) is not synthesized."""
+        synthesizer = ConflictSynthesizer(MockLLMClient())
+        low_weight = Conflict(
+            id="test",
+            finding_ids=["a", "b"],
+            nature=ConflictNature.TRADE_OFF,
+            description="Test",
+            severity_weight=0.5,
+        )
+
+        verdict = synthesizer.should_synthesize(low_weight)
+
+        assert verdict is False
+
+    def test_synthesize_high_severity(self) -> None:
+        """A trade-off with weight 0.8 (above the 0.7 threshold) is synthesized."""
+        synthesizer = ConflictSynthesizer(MockLLMClient())
+        high_weight = Conflict(
+            id="test",
+            finding_ids=["a", "b"],
+            nature=ConflictNature.TRADE_OFF,
+            description="Test",
+            severity_weight=0.8,
+        )
+
+        verdict = synthesizer.should_synthesize(high_weight)
+
+        assert verdict is True
+
+    @pytest.mark.asyncio
+    async def test_synthesize_fallback_prefer_second(self) -> None:
+        """The fallback prefers the second finding when it is the more severe one."""
+        llm = MockLLMClient(responses=["not valid json"])
+        synthesizer = ConflictSynthesizer(llm)
+
+        minor = make_finding(AgentName.STYLE, severity=Severity.LOW)
+        major = make_finding(AgentName.SECURITY, severity=Severity.HIGH)
+        conflict = Conflict(
+            id="test-conflict",
+            finding_ids=[minor.id, major.id],
+            nature=ConflictNature.CONTRADICTORY,
+            description="Test conflict",
+            severity_weight=0.8,
+        )
+
+        outcome = await synthesizer.synthesize(conflict, [minor, major])
+
+        assert outcome.decision == "prefer_second"
+        assert "fallback" in outcome.reasoning.lower()
+
+    @pytest.mark.asyncio
+    async def test_synthesize_fallback_equal_severity(self) -> None:
+        """The fallback keeps both findings when their severities are equal."""
+        llm = MockLLMClient(responses=["not valid json"])
+        synthesizer = ConflictSynthesizer(llm)
+
+        style = make_finding(AgentName.STYLE, severity=Severity.MEDIUM)
+        security = make_finding(AgentName.SECURITY, severity=Severity.MEDIUM)
+        conflict = Conflict(
+            id="test-conflict",
+            finding_ids=[style.id, security.id],
+            nature=ConflictNature.CONTRADICTORY,
+            description="Test conflict",
+            severity_weight=0.8,
+        )
+
+        outcome = await synthesizer.synthesize(conflict, [style, security])
+
+        assert outcome.decision == "keep_both"
+        assert "equal severity" in outcome.reasoning.lower()
+
+    @pytest.mark.asyncio
+    async def test_synthesize_parse_json_in_code_block(self) -> None:
+        """JSON wrapped in a fenced code block after prose is still parsed."""
+        fenced_reply = """Here is my analysis:
+```json
+{
+    "decision": "merge",
+    "reasoning": "Both concerns valid",
+    "merged_suggestion": "Do both things",
+    "confidence": 0.9
+}
+```
+"""
+        llm = MockLLMClient(responses=[fenced_reply])
+        synthesizer = ConflictSynthesizer(llm)
+
+        security = make_finding(AgentName.SECURITY)
+        complexity = make_finding(AgentName.COMPLEXITY)
+        conflict = Conflict(
+            id="test-conflict",
+            finding_ids=[security.id, complexity.id],
+            nature=ConflictNature.CONTRADICTORY,
+            description="Test",
+            severity_weight=0.8,
+        )
+
+        outcome = await synthesizer.synthesize(conflict, [security, complexity])
+
+        assert outcome.decision == "merge"
+        assert outcome.merged_suggestion == "Do both things"
+
+    @pytest.mark.asyncio
+    async def test_synthesize_parse_plain_json(self) -> None:
+        """Bare JSON without a ``merged_suggestion`` key is parsed."""
+        bare_reply = """{
+            "decision": "prefer_second",
+            "reasoning": "Second is better",
+            "confidence": 0.75
+        }"""
+        llm = MockLLMClient(responses=[bare_reply])
+        synthesizer = ConflictSynthesizer(llm)
+
+        security = make_finding(AgentName.SECURITY)
+        complexity = make_finding(AgentName.COMPLEXITY)
+        conflict = Conflict(
+            id="test-conflict",
+            finding_ids=[security.id, complexity.id],
+            nature=ConflictNature.CONTRADICTORY,
+            description="Test",
+            severity_weight=0.8,
+        )
+
+        outcome = await synthesizer.synthesize(conflict, [security, complexity])
+
+        assert outcome.decision == "prefer_second"
+        assert outcome.confidence == 0.75
+
+
+class TestFindingMergerEdgeCases:
+    """Edge-case tests for ``FindingMerger`` and ``FindingGroup``."""
+
+    def test_merge_different_files(self) -> None:
+        """Findings at the same line in different files are neither grouped nor deduplicated."""
+        merger = FindingMerger()
+        in_a = make_finding(AgentName.SECURITY, file="a.py", line_start=10)
+        in_b = make_finding(AgentName.SECURITY, file="b.py", line_start=10)
+
+        outcome = merger.merge([in_a, in_b], None)
+
+        assert len(outcome.groups) == 2
+        assert len(outcome.unique_findings) == 2
+
+    def test_finding_group_empty(self) -> None:
+        """An empty group has no primary finding and no agents."""
+        empty = FindingGroup(file="test.py", line_start=10, line_end=20, findings=[])
+
+        assert empty.primary_finding is None
+        assert empty.agents == []