add static analysis + deliberation pipeline
This commit is contained in:
24
src/arbiter/analysis/__init__.py
Normal file
24
src/arbiter/analysis/__init__.py
Normal file
@@ -0,0 +1,24 @@
|
||||
"""Static analysis and diff parsing for Arbiter."""
|
||||
|
||||
from arbiter.analysis.diff import DiffFile, DiffHunk, DiffLine, DiffParser, LineType, ParsedDiff
|
||||
from arbiter.analysis.static import (
|
||||
StaticAnalysisConfig,
|
||||
StaticAnalysisResult,
|
||||
StaticAnalysisRunner,
|
||||
StaticFinding,
|
||||
run_static_analysis,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"DiffFile",
|
||||
"DiffHunk",
|
||||
"DiffLine",
|
||||
"DiffParser",
|
||||
"LineType",
|
||||
"ParsedDiff",
|
||||
"StaticAnalysisConfig",
|
||||
"StaticAnalysisResult",
|
||||
"StaticAnalysisRunner",
|
||||
"StaticFinding",
|
||||
"run_static_analysis",
|
||||
]
|
||||
283
src/arbiter/analysis/diff.py
Normal file
283
src/arbiter/analysis/diff.py
Normal file
@@ -0,0 +1,283 @@
|
||||
"""Unified diff parser with line mapping."""
|
||||
|
||||
import re
|
||||
from enum import StrEnum
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class LineType(StrEnum):
|
||||
"""Type of line in a diff."""
|
||||
|
||||
ADDED = "added"
|
||||
REMOVED = "removed"
|
||||
CONTEXT = "context"
|
||||
|
||||
|
||||
class DiffLine(BaseModel):
|
||||
"""A single line in a diff hunk."""
|
||||
|
||||
content: str = Field(description="The line content without prefix")
|
||||
line_type: LineType = Field(description="Type of line (added/removed/context)")
|
||||
old_line: int | None = Field(default=None, description="Line number in original file")
|
||||
new_line: int | None = Field(default=None, description="Line number in new file")
|
||||
|
||||
|
||||
class DiffHunk(BaseModel):
|
||||
"""A hunk within a diff file."""
|
||||
|
||||
old_start: int = Field(description="Starting line in original file")
|
||||
old_count: int = Field(description="Number of lines from original file")
|
||||
new_start: int = Field(description="Starting line in new file")
|
||||
new_count: int = Field(description="Number of lines in new file")
|
||||
header: str = Field(default="", description="Optional function/class context from @@ line")
|
||||
lines: list[DiffLine] = Field(default_factory=list, description="Lines in this hunk")
|
||||
|
||||
def get_added_lines(self) -> list[DiffLine]:
|
||||
return [line for line in self.lines if line.line_type == LineType.ADDED]
|
||||
|
||||
def get_removed_lines(self) -> list[DiffLine]:
|
||||
return [line for line in self.lines if line.line_type == LineType.REMOVED]
|
||||
|
||||
|
||||
class DiffFile(BaseModel):
|
||||
"""A single file's diff."""
|
||||
|
||||
old_path: str = Field(description="Original file path (a/...)")
|
||||
new_path: str = Field(description="New file path (b/...)")
|
||||
hunks: list[DiffHunk] = Field(default_factory=list, description="Hunks in this file")
|
||||
is_new: bool = Field(default=False, description="True if file is newly created")
|
||||
is_deleted: bool = Field(default=False, description="True if file is deleted")
|
||||
is_binary: bool = Field(default=False, description="True if binary file")
|
||||
|
||||
@property
|
||||
def path(self) -> str:
|
||||
if self.is_deleted:
|
||||
return self.old_path
|
||||
return self.new_path
|
||||
|
||||
def get_added_line_numbers(self) -> list[int]:
|
||||
numbers = []
|
||||
for hunk in self.hunks:
|
||||
for line in hunk.lines:
|
||||
if line.line_type == LineType.ADDED and line.new_line is not None:
|
||||
numbers.append(line.new_line)
|
||||
return numbers
|
||||
|
||||
def get_changed_line_range(self) -> tuple[int, int] | None:
|
||||
added_lines = self.get_added_line_numbers()
|
||||
if not added_lines:
|
||||
return None
|
||||
return (min(added_lines), max(added_lines))
|
||||
|
||||
def line_in_diff(self, line_number: int) -> bool:
|
||||
for hunk in self.hunks:
|
||||
if hunk.new_start <= line_number < hunk.new_start + hunk.new_count:
|
||||
return True
|
||||
return False
|
||||
|
||||
def map_new_to_old(self, new_line: int) -> int | None:
|
||||
for hunk in self.hunks:
|
||||
for line in hunk.lines:
|
||||
if line.new_line == new_line and line.old_line is not None:
|
||||
return line.old_line
|
||||
return None
|
||||
|
||||
|
||||
class ParsedDiff(BaseModel):
|
||||
"""Complete parsed diff containing multiple files."""
|
||||
|
||||
files: list[DiffFile] = Field(default_factory=list, description="Files in this diff")
|
||||
raw_diff: str = Field(default="", description="Original diff text")
|
||||
|
||||
def get_file(self, path: str) -> DiffFile | None:
|
||||
for file in self.files:
|
||||
if file.path == path or file.old_path == path or file.new_path == path:
|
||||
return file
|
||||
return None
|
||||
|
||||
def get_changed_files(self) -> list[str]:
|
||||
return [f.path for f in self.files]
|
||||
|
||||
def get_added_files(self) -> list[str]:
|
||||
return [f.path for f in self.files if f.is_new]
|
||||
|
||||
def get_deleted_files(self) -> list[str]:
|
||||
return [f.path for f in self.files if f.is_deleted]
|
||||
|
||||
|
||||
class DiffParser:
|
||||
"""Parser for unified diff format."""
|
||||
|
||||
# Regex patterns for diff parsing
|
||||
FILE_HEADER_PATTERN = re.compile(r"^diff --git a/(.*) b/(.*)$")
|
||||
OLD_FILE_PATTERN = re.compile(r"^--- (?:a/)?(.*)$")
|
||||
NEW_FILE_PATTERN = re.compile(r"^\+\+\+ (?:b/)?(.*)$")
|
||||
HUNK_HEADER_PATTERN = re.compile(r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*)$")
|
||||
BINARY_FILE_PATTERN = re.compile(r"^Binary files .* differ$")
|
||||
NEW_FILE_MODE_PATTERN = re.compile(r"^new file mode")
|
||||
DELETED_FILE_MODE_PATTERN = re.compile(r"^deleted file mode")
|
||||
|
||||
def parse(self, diff_text: str) -> ParsedDiff:
|
||||
"""Parse a unified diff string into structured data."""
|
||||
if not diff_text or not diff_text.strip():
|
||||
return ParsedDiff(raw_diff=diff_text)
|
||||
|
||||
files: list[DiffFile] = []
|
||||
current_file: DiffFile | None = None
|
||||
current_hunk: DiffHunk | None = None
|
||||
|
||||
lines = diff_text.splitlines()
|
||||
i = 0
|
||||
|
||||
while i < len(lines):
|
||||
line = lines[i]
|
||||
|
||||
# Check for new file header (diff --git)
|
||||
file_match = self.FILE_HEADER_PATTERN.match(line)
|
||||
if file_match:
|
||||
# Save previous file if exists
|
||||
if current_file is not None:
|
||||
if current_hunk is not None:
|
||||
current_file.hunks.append(current_hunk)
|
||||
files.append(current_file)
|
||||
|
||||
current_file = DiffFile(
|
||||
old_path=file_match.group(1),
|
||||
new_path=file_match.group(2),
|
||||
)
|
||||
current_hunk = None
|
||||
i += 1
|
||||
continue
|
||||
|
||||
# Check for new/deleted file mode
|
||||
if current_file is not None:
|
||||
if self.NEW_FILE_MODE_PATTERN.match(line):
|
||||
current_file.is_new = True
|
||||
i += 1
|
||||
continue
|
||||
if self.DELETED_FILE_MODE_PATTERN.match(line):
|
||||
current_file.is_deleted = True
|
||||
i += 1
|
||||
continue
|
||||
|
||||
# Check for binary file
|
||||
if current_file is not None and self.BINARY_FILE_PATTERN.match(line):
|
||||
current_file.is_binary = True
|
||||
i += 1
|
||||
continue
|
||||
|
||||
# Check for old file path (--- a/...)
|
||||
old_match = self.OLD_FILE_PATTERN.match(line)
|
||||
if old_match:
|
||||
if current_file is not None:
|
||||
# Update old_path if it's different
|
||||
path = old_match.group(1)
|
||||
if path != "/dev/null":
|
||||
current_file.old_path = path
|
||||
else:
|
||||
current_file.is_new = True
|
||||
i += 1
|
||||
continue
|
||||
|
||||
# Check for new file path (+++ b/...)
|
||||
new_match = self.NEW_FILE_PATTERN.match(line)
|
||||
if new_match:
|
||||
if current_file is not None:
|
||||
path = new_match.group(1)
|
||||
if path != "/dev/null":
|
||||
current_file.new_path = path
|
||||
else:
|
||||
current_file.is_deleted = True
|
||||
i += 1
|
||||
continue
|
||||
|
||||
# Check for hunk header (@@ ... @@)
|
||||
hunk_match = self.HUNK_HEADER_PATTERN.match(line)
|
||||
if hunk_match:
|
||||
# Save previous hunk if exists
|
||||
if current_file is not None and current_hunk is not None:
|
||||
current_file.hunks.append(current_hunk)
|
||||
|
||||
old_start = int(hunk_match.group(1))
|
||||
old_count = int(hunk_match.group(2)) if hunk_match.group(2) else 1
|
||||
new_start = int(hunk_match.group(3))
|
||||
new_count = int(hunk_match.group(4)) if hunk_match.group(4) else 1
|
||||
header = hunk_match.group(5).strip()
|
||||
|
||||
current_hunk = DiffHunk(
|
||||
old_start=old_start,
|
||||
old_count=old_count,
|
||||
new_start=new_start,
|
||||
new_count=new_count,
|
||||
header=header,
|
||||
)
|
||||
i += 1
|
||||
continue
|
||||
|
||||
# Parse hunk lines
|
||||
if current_hunk is not None and line:
|
||||
diff_line = self._parse_hunk_line(line, current_hunk)
|
||||
if diff_line is not None:
|
||||
current_hunk.lines.append(diff_line)
|
||||
|
||||
i += 1
|
||||
|
||||
# Don't forget the last file and hunk
|
||||
if current_file is not None:
|
||||
if current_hunk is not None:
|
||||
current_file.hunks.append(current_hunk)
|
||||
files.append(current_file)
|
||||
|
||||
return ParsedDiff(files=files, raw_diff=diff_text)
|
||||
|
||||
def _parse_hunk_line(self, line: str, hunk: DiffHunk) -> DiffLine | None:
|
||||
"""Parse a single line within a hunk."""
|
||||
if not line:
|
||||
return None
|
||||
|
||||
prefix = line[0] if line else " "
|
||||
content = line[1:] if len(line) > 1 else ""
|
||||
|
||||
# Calculate current line numbers based on existing lines
|
||||
old_line = hunk.old_start
|
||||
new_line = hunk.new_start
|
||||
|
||||
for existing in hunk.lines:
|
||||
if existing.line_type in (LineType.CONTEXT, LineType.REMOVED):
|
||||
old_line += 1
|
||||
if existing.line_type in (LineType.CONTEXT, LineType.ADDED):
|
||||
new_line += 1
|
||||
|
||||
if prefix == "+":
|
||||
return DiffLine(
|
||||
content=content,
|
||||
line_type=LineType.ADDED,
|
||||
old_line=None,
|
||||
new_line=new_line,
|
||||
)
|
||||
elif prefix == "-":
|
||||
return DiffLine(
|
||||
content=content,
|
||||
line_type=LineType.REMOVED,
|
||||
old_line=old_line,
|
||||
new_line=None,
|
||||
)
|
||||
elif prefix == " ":
|
||||
return DiffLine(
|
||||
content=content,
|
||||
line_type=LineType.CONTEXT,
|
||||
old_line=old_line,
|
||||
new_line=new_line,
|
||||
)
|
||||
elif prefix == "\\":
|
||||
# "\ No newline at end of file" - skip
|
||||
return None
|
||||
else:
|
||||
# Unknown prefix, treat as context
|
||||
return DiffLine(
|
||||
content=line,
|
||||
line_type=LineType.CONTEXT,
|
||||
old_line=old_line,
|
||||
new_line=new_line,
|
||||
)
|
||||
519
src/arbiter/analysis/static.py
Normal file
519
src/arbiter/analysis/static.py
Normal file
@@ -0,0 +1,519 @@
|
||||
"""Static analysis runners for ruff, mypy, bandit, and radon."""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import ClassVar
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from arbiter.analysis.diff import DiffFile, ParsedDiff
|
||||
from arbiter.models.enums import AgentName, Severity
|
||||
from arbiter.models.finding import Finding
|
||||
|
||||
|
||||
class StaticAnalysisConfig(BaseModel):
|
||||
"""Configuration for static analysis tools."""
|
||||
|
||||
ruff_enabled: bool = Field(default=True, description="Run ruff linter")
|
||||
mypy_enabled: bool = Field(default=True, description="Run mypy type checker")
|
||||
bandit_enabled: bool = Field(default=True, description="Run bandit security scanner")
|
||||
radon_enabled: bool = Field(default=True, description="Run radon complexity analyzer")
|
||||
ruff_config: str | None = Field(default=None, description="Path to ruff config file")
|
||||
mypy_config: str | None = Field(default=None, description="Path to mypy config file")
|
||||
|
||||
|
||||
class StaticFinding(BaseModel):
|
||||
"""A finding from static analysis tools."""
|
||||
|
||||
tool: str = Field(description="Tool that produced this finding (ruff, mypy, bandit, radon)")
|
||||
file: str = Field(description="File path")
|
||||
line: int = Field(description="Line number")
|
||||
column: int | None = Field(default=None, description="Column number")
|
||||
code: str = Field(description="Error/warning code (e.g., E501, B303)")
|
||||
message: str = Field(description="Description of the issue")
|
||||
severity: Severity = Field(description="Mapped severity level")
|
||||
end_line: int | None = Field(default=None, description="End line for multi-line issues")
|
||||
end_column: int | None = Field(default=None, description="End column")
|
||||
fix_available: bool = Field(default=False, description="Whether an auto-fix is available")
|
||||
extra: dict[str, str | int | None] | None = Field(
|
||||
default=None, description="Tool-specific extra data"
|
||||
)
|
||||
|
||||
|
||||
class StaticAnalysisResult(BaseModel):
|
||||
"""Aggregated results from all static analysis tools."""
|
||||
|
||||
findings: list[StaticFinding] = Field(default_factory=list)
|
||||
tool_errors: dict[str, str] = Field(
|
||||
default_factory=dict, description="Errors from tools that failed"
|
||||
)
|
||||
tools_run: list[str] = Field(default_factory=list, description="Tools that ran successfully")
|
||||
|
||||
|
||||
class StaticAnalysisRunner:
|
||||
"""Runs static analysis tools on diff files."""
|
||||
|
||||
# Severity mapping for ruff codes
|
||||
RUFF_SEVERITY_MAP: ClassVar[dict[str, Severity]] = {
|
||||
# Security-related (B = bandit-like checks in ruff)
|
||||
"S": Severity.HIGH, # Security issues
|
||||
# Errors
|
||||
"E": Severity.MEDIUM,
|
||||
"F": Severity.MEDIUM, # Pyflakes
|
||||
"W": Severity.LOW, # Warnings
|
||||
# Style
|
||||
"C": Severity.LOW, # McCabe complexity, conventions
|
||||
"N": Severity.LOW, # pep8-naming
|
||||
"D": Severity.INFO, # pydocstyle
|
||||
"I": Severity.INFO, # isort
|
||||
# Type annotations
|
||||
"ANN": Severity.LOW,
|
||||
# Builtins
|
||||
"A": Severity.MEDIUM,
|
||||
# Bugbear
|
||||
"B": Severity.MEDIUM,
|
||||
}
|
||||
|
||||
# Bandit severity mapping
|
||||
BANDIT_SEVERITY_MAP: ClassVar[dict[str, Severity]] = {
|
||||
"HIGH": Severity.HIGH,
|
||||
"MEDIUM": Severity.MEDIUM,
|
||||
"LOW": Severity.LOW,
|
||||
}
|
||||
|
||||
# Radon complexity thresholds (cyclomatic complexity)
|
||||
RADON_THRESHOLDS: ClassVar[dict[str, Severity]] = {
|
||||
"F": Severity.CRITICAL, # Very high risk (>40)
|
||||
"E": Severity.HIGH, # High risk (31-40)
|
||||
"D": Severity.HIGH, # More than moderate (21-30)
|
||||
"C": Severity.MEDIUM, # Moderate (11-20)
|
||||
"B": Severity.LOW, # Low (6-10)
|
||||
"A": Severity.INFO, # Simple (1-5)
|
||||
}
|
||||
|
||||
def __init__(self, config: StaticAnalysisConfig | None = None) -> None:
|
||||
self.config = config or StaticAnalysisConfig()
|
||||
|
||||
async def run(self, diff: ParsedDiff, work_dir: Path | None = None) -> StaticAnalysisResult:
|
||||
"""Run all enabled static analysis tools on the diff."""
|
||||
result = StaticAnalysisResult()
|
||||
|
||||
# If no work_dir provided, we can't run analysis on actual files
|
||||
# In that case, return empty result
|
||||
if work_dir is None:
|
||||
return result
|
||||
|
||||
# Collect Python files from the diff
|
||||
python_files = [
|
||||
f for f in diff.files if f.path.endswith(".py") and not f.is_deleted and not f.is_binary
|
||||
]
|
||||
|
||||
if not python_files:
|
||||
return result
|
||||
|
||||
# Run tools in parallel
|
||||
tasks = []
|
||||
if self.config.ruff_enabled:
|
||||
tasks.append(self._run_ruff(python_files, work_dir, diff))
|
||||
if self.config.mypy_enabled:
|
||||
tasks.append(self._run_mypy(python_files, work_dir, diff))
|
||||
if self.config.bandit_enabled:
|
||||
tasks.append(self._run_bandit(python_files, work_dir, diff))
|
||||
if self.config.radon_enabled:
|
||||
tasks.append(self._run_radon(python_files, work_dir, diff))
|
||||
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
tool_names = []
|
||||
if self.config.ruff_enabled:
|
||||
tool_names.append("ruff")
|
||||
if self.config.mypy_enabled:
|
||||
tool_names.append("mypy")
|
||||
if self.config.bandit_enabled:
|
||||
tool_names.append("bandit")
|
||||
if self.config.radon_enabled:
|
||||
tool_names.append("radon")
|
||||
|
||||
for tool_name, tool_result in zip(tool_names, results, strict=False):
|
||||
if isinstance(tool_result, BaseException):
|
||||
result.tool_errors[tool_name] = str(tool_result)
|
||||
elif isinstance(tool_result, list):
|
||||
result.findings.extend(tool_result)
|
||||
result.tools_run.append(tool_name)
|
||||
|
||||
return result
|
||||
|
||||
async def _run_ruff(
|
||||
self, files: list[DiffFile], work_dir: Path, diff: ParsedDiff
|
||||
) -> list[StaticFinding]:
|
||||
"""Run ruff linter and parse JSON output."""
|
||||
file_paths = [str(work_dir / f.path) for f in files if (work_dir / f.path).exists()]
|
||||
if not file_paths:
|
||||
return []
|
||||
|
||||
cmd = ["ruff", "check", "--output-format=json"]
|
||||
if self.config.ruff_config:
|
||||
cmd.extend(["--config", self.config.ruff_config])
|
||||
cmd.extend(file_paths)
|
||||
|
||||
try:
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
*cmd,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
cwd=work_dir,
|
||||
)
|
||||
stdout, _ = await proc.communicate()
|
||||
|
||||
# ruff returns non-zero if issues found, which is expected
|
||||
if stdout:
|
||||
return self._parse_ruff_output(stdout.decode(), diff, work_dir)
|
||||
return []
|
||||
except FileNotFoundError:
|
||||
raise RuntimeError("ruff not found. Install with: pip install ruff") from None
|
||||
|
||||
def _parse_ruff_output(
|
||||
self, output: str, diff: ParsedDiff, work_dir: Path
|
||||
) -> list[StaticFinding]:
|
||||
"""Parse ruff JSON output into StaticFindings."""
|
||||
if not output.strip():
|
||||
return []
|
||||
|
||||
try:
|
||||
issues = json.loads(output)
|
||||
except json.JSONDecodeError:
|
||||
return []
|
||||
|
||||
findings = []
|
||||
for issue in issues:
|
||||
file_path = issue.get("filename", "")
|
||||
# Make path relative to work_dir
|
||||
try:
|
||||
rel_path = Path(file_path).relative_to(work_dir)
|
||||
file_path = str(rel_path)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
line = issue.get("location", {}).get("row", 1)
|
||||
|
||||
# Only include findings that are in the diff
|
||||
diff_file = diff.get_file(file_path)
|
||||
if diff_file is None or not diff_file.line_in_diff(line):
|
||||
continue
|
||||
|
||||
code = issue.get("code", "")
|
||||
severity = self._ruff_code_to_severity(code)
|
||||
|
||||
findings.append(
|
||||
StaticFinding(
|
||||
tool="ruff",
|
||||
file=file_path,
|
||||
line=line,
|
||||
column=issue.get("location", {}).get("column"),
|
||||
code=code,
|
||||
message=issue.get("message", ""),
|
||||
severity=severity,
|
||||
end_line=issue.get("end_location", {}).get("row"),
|
||||
end_column=issue.get("end_location", {}).get("column"),
|
||||
fix_available=issue.get("fix") is not None,
|
||||
extra={"url": issue.get("url")},
|
||||
)
|
||||
)
|
||||
|
||||
return findings
|
||||
|
||||
def _ruff_code_to_severity(self, code: str) -> Severity:
|
||||
"""Map ruff error code to severity level."""
|
||||
if not code:
|
||||
return Severity.MEDIUM
|
||||
|
||||
# Check prefixes from longest to shortest
|
||||
for prefix in sorted(self.RUFF_SEVERITY_MAP.keys(), key=len, reverse=True):
|
||||
if code.startswith(prefix):
|
||||
return self.RUFF_SEVERITY_MAP[prefix]
|
||||
|
||||
return Severity.MEDIUM
|
||||
|
||||
async def _run_mypy(
|
||||
self, files: list[DiffFile], work_dir: Path, diff: ParsedDiff
|
||||
) -> list[StaticFinding]:
|
||||
"""Run mypy type checker and parse output."""
|
||||
file_paths = [str(work_dir / f.path) for f in files if (work_dir / f.path).exists()]
|
||||
if not file_paths:
|
||||
return []
|
||||
|
||||
cmd = ["mypy", "--no-error-summary", "--show-column-numbers"]
|
||||
if self.config.mypy_config:
|
||||
cmd.extend(["--config-file", self.config.mypy_config])
|
||||
cmd.extend(file_paths)
|
||||
|
||||
try:
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
*cmd,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
cwd=work_dir,
|
||||
)
|
||||
stdout, _ = await proc.communicate()
|
||||
|
||||
if stdout:
|
||||
return self._parse_mypy_output(stdout.decode(), diff, work_dir)
|
||||
return []
|
||||
except FileNotFoundError:
|
||||
raise RuntimeError("mypy not found. Install with: pip install mypy") from None
|
||||
|
||||
def _parse_mypy_output(
|
||||
self, output: str, diff: ParsedDiff, work_dir: Path
|
||||
) -> list[StaticFinding]:
|
||||
"""Parse mypy output into StaticFindings."""
|
||||
findings = []
|
||||
# mypy output format: file:line:column: severity: message
|
||||
pattern = re.compile(r"^(.+?):(\d+):(\d+): (\w+): (.+)$")
|
||||
|
||||
for line in output.strip().split("\n"):
|
||||
match = pattern.match(line)
|
||||
if not match:
|
||||
continue
|
||||
|
||||
file_path, line_num, col, level, message = match.groups()
|
||||
|
||||
# Make path relative
|
||||
try:
|
||||
rel_path = Path(file_path).relative_to(work_dir)
|
||||
file_path = str(rel_path)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
line_num = int(line_num)
|
||||
|
||||
# Only include findings in the diff
|
||||
diff_file = diff.get_file(file_path)
|
||||
if diff_file is None or not diff_file.line_in_diff(line_num):
|
||||
continue
|
||||
|
||||
# Map mypy severity
|
||||
if level == "error":
|
||||
severity = Severity.MEDIUM
|
||||
elif level == "warning":
|
||||
severity = Severity.LOW
|
||||
else:
|
||||
severity = Severity.INFO
|
||||
|
||||
findings.append(
|
||||
StaticFinding(
|
||||
tool="mypy",
|
||||
file=file_path,
|
||||
line=line_num,
|
||||
column=int(col),
|
||||
code=f"mypy-{level}",
|
||||
message=message,
|
||||
severity=severity,
|
||||
)
|
||||
)
|
||||
|
||||
return findings
|
||||
|
||||
async def _run_bandit(
|
||||
self, files: list[DiffFile], work_dir: Path, diff: ParsedDiff
|
||||
) -> list[StaticFinding]:
|
||||
"""Run bandit security scanner and parse JSON output."""
|
||||
file_paths = [str(work_dir / f.path) for f in files if (work_dir / f.path).exists()]
|
||||
if not file_paths:
|
||||
return []
|
||||
|
||||
cmd = ["bandit", "-f", "json", "-q"]
|
||||
cmd.extend(file_paths)
|
||||
|
||||
try:
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
*cmd,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
cwd=work_dir,
|
||||
)
|
||||
stdout, _ = await proc.communicate()
|
||||
|
||||
if stdout:
|
||||
return self._parse_bandit_output(stdout.decode(), diff, work_dir)
|
||||
return []
|
||||
except FileNotFoundError:
|
||||
raise RuntimeError("bandit not found. Install with: pip install bandit") from None
|
||||
|
||||
def _parse_bandit_output(
|
||||
self, output: str, diff: ParsedDiff, work_dir: Path
|
||||
) -> list[StaticFinding]:
|
||||
"""Parse bandit JSON output into StaticFindings."""
|
||||
if not output.strip():
|
||||
return []
|
||||
|
||||
try:
|
||||
data = json.loads(output)
|
||||
except json.JSONDecodeError:
|
||||
return []
|
||||
|
||||
findings = []
|
||||
for issue in data.get("results", []):
|
||||
file_path = issue.get("filename", "")
|
||||
|
||||
# Make path relative
|
||||
try:
|
||||
rel_path = Path(file_path).relative_to(work_dir)
|
||||
file_path = str(rel_path)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
line = issue.get("line_number", 1)
|
||||
|
||||
# Only include findings in the diff
|
||||
diff_file = diff.get_file(file_path)
|
||||
if diff_file is None or not diff_file.line_in_diff(line):
|
||||
continue
|
||||
|
||||
severity_str = issue.get("issue_severity", "MEDIUM")
|
||||
severity = self.BANDIT_SEVERITY_MAP.get(severity_str, Severity.MEDIUM)
|
||||
|
||||
findings.append(
|
||||
StaticFinding(
|
||||
tool="bandit",
|
||||
file=file_path,
|
||||
line=line,
|
||||
code=issue.get("test_id", ""),
|
||||
message=f"{issue.get('issue_text', '')} (Confidence: {issue.get('issue_confidence', 'MEDIUM')})",
|
||||
severity=severity,
|
||||
end_line=issue.get("line_range", [line])[-1]
|
||||
if issue.get("line_range")
|
||||
else None,
|
||||
extra={
|
||||
"test_name": issue.get("test_name"),
|
||||
"confidence": issue.get("issue_confidence"),
|
||||
"cwe": issue.get("issue_cwe", {}).get("id")
|
||||
if issue.get("issue_cwe")
|
||||
else None,
|
||||
"more_info": issue.get("more_info"),
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
return findings
|
||||
|
||||
async def _run_radon(
|
||||
self, files: list[DiffFile], work_dir: Path, diff: ParsedDiff
|
||||
) -> list[StaticFinding]:
|
||||
"""Run radon complexity analyzer and parse JSON output."""
|
||||
file_paths = [str(work_dir / f.path) for f in files if (work_dir / f.path).exists()]
|
||||
if not file_paths:
|
||||
return []
|
||||
|
||||
cmd = ["radon", "cc", "-j", "-s"]
|
||||
cmd.extend(file_paths)
|
||||
|
||||
try:
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
*cmd,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
cwd=work_dir,
|
||||
)
|
||||
stdout, _ = await proc.communicate()
|
||||
|
||||
if stdout:
|
||||
return self._parse_radon_output(stdout.decode(), diff, work_dir)
|
||||
return []
|
||||
except FileNotFoundError:
|
||||
raise RuntimeError("radon not found. Install with: pip install radon") from None
|
||||
|
||||
def _parse_radon_output(
|
||||
self, output: str, diff: ParsedDiff, work_dir: Path
|
||||
) -> list[StaticFinding]:
|
||||
"""Parse radon JSON output into StaticFindings."""
|
||||
if not output.strip():
|
||||
return []
|
||||
|
||||
try:
|
||||
data = json.loads(output)
|
||||
except json.JSONDecodeError:
|
||||
return []
|
||||
|
||||
findings = []
|
||||
for file_path, blocks in data.items():
|
||||
# Make path relative
|
||||
try:
|
||||
rel_path = Path(file_path).relative_to(work_dir)
|
||||
file_path = str(rel_path)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
for block in blocks:
|
||||
line = block.get("lineno", 1)
|
||||
|
||||
# Only include findings in the diff
|
||||
diff_file = diff.get_file(file_path)
|
||||
if diff_file is None or not diff_file.line_in_diff(line):
|
||||
continue
|
||||
|
||||
rank = block.get("rank", "A")
|
||||
complexity = block.get("complexity", 0)
|
||||
|
||||
# Only report if complexity is notable (C or worse)
|
||||
if rank in ("A", "B"):
|
||||
continue
|
||||
|
||||
severity = self.RADON_THRESHOLDS.get(rank, Severity.INFO)
|
||||
block_type = block.get("type", "function")
|
||||
name = block.get("name", "unknown")
|
||||
|
||||
findings.append(
|
||||
StaticFinding(
|
||||
tool="radon",
|
||||
file=file_path,
|
||||
line=line,
|
||||
code=f"CC{rank}",
|
||||
message=f"{block_type.capitalize()} '{name}' has cyclomatic complexity of {complexity} (rank {rank})",
|
||||
severity=severity,
|
||||
end_line=block.get("endline"),
|
||||
extra={
|
||||
"complexity": complexity,
|
||||
"rank": rank,
|
||||
"type": block_type,
|
||||
"name": name,
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
return findings
|
||||
|
||||
def convert_to_finding(
|
||||
self, static_finding: StaticFinding, prompt_version: str = "static-v1.0"
|
||||
) -> Finding:
|
||||
"""Convert a StaticFinding to the standard Finding model."""
|
||||
return Finding(
|
||||
id=f"{static_finding.tool}-{static_finding.file}-{static_finding.line}-{static_finding.code}",
|
||||
agent=AgentName.STYLE, # Static analysis is grouped under style
|
||||
file=static_finding.file,
|
||||
line_start=static_finding.line,
|
||||
line_end=static_finding.end_line or static_finding.line,
|
||||
severity=static_finding.severity,
|
||||
confidence=0.95, # Static analysis has high confidence
|
||||
title=f"[{static_finding.tool}] {static_finding.code}",
|
||||
description=static_finding.message,
|
||||
reasoning=f"Detected by {static_finding.tool} static analysis",
|
||||
suggestion="Fix the issue as indicated by the tool"
|
||||
if static_finding.fix_available
|
||||
else None,
|
||||
references=[str(static_finding.extra.get("url"))]
|
||||
if static_finding.extra and static_finding.extra.get("url") is not None
|
||||
else [],
|
||||
prompt_version=prompt_version,
|
||||
static_analysis_context={"tool": static_finding.tool, "code": static_finding.code},
|
||||
)
|
||||
|
||||
|
||||
async def run_static_analysis(
|
||||
diff: ParsedDiff,
|
||||
work_dir: Path | None = None,
|
||||
config: StaticAnalysisConfig | None = None,
|
||||
) -> StaticAnalysisResult:
|
||||
runner = StaticAnalysisRunner(config)
|
||||
return await runner.run(diff, work_dir)
|
||||
26
src/arbiter/deliberation/__init__.py
Normal file
26
src/arbiter/deliberation/__init__.py
Normal file
@@ -0,0 +1,26 @@
|
||||
"""Deliberation module for merging and synthesizing review findings."""
|
||||
|
||||
from arbiter.deliberation.conflicts import Conflict, ConflictDetector, ConflictNature
|
||||
from arbiter.deliberation.coordinator import (
|
||||
Coordinator,
|
||||
DeliberationResult,
|
||||
DeliberationStep,
|
||||
StepType,
|
||||
)
|
||||
from arbiter.deliberation.merger import FindingGroup, FindingMerger, MergedFindings
|
||||
from arbiter.deliberation.synthesis import ConflictSynthesizer, Resolution
|
||||
|
||||
__all__ = [
|
||||
"Conflict",
|
||||
"ConflictDetector",
|
||||
"ConflictNature",
|
||||
"ConflictSynthesizer",
|
||||
"Coordinator",
|
||||
"DeliberationResult",
|
||||
"DeliberationStep",
|
||||
"FindingGroup",
|
||||
"FindingMerger",
|
||||
"MergedFindings",
|
||||
"Resolution",
|
||||
"StepType",
|
||||
]
|
||||
256
src/arbiter/deliberation/conflicts.py
Normal file
256
src/arbiter/deliberation/conflicts.py
Normal file
@@ -0,0 +1,256 @@
|
||||
"""Conflict detection for opposing agent recommendations."""
|
||||
|
||||
from enum import StrEnum
|
||||
from typing import ClassVar
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from arbiter.models import Finding, Severity
|
||||
|
||||
|
||||
class ConflictNature(StrEnum):
|
||||
"""Nature of a conflict between findings."""
|
||||
|
||||
CONTRADICTORY = "contradictory" # Directly opposing recommendations
|
||||
TRADE_OFF = "trade_off" # Both valid but competing concerns
|
||||
OVERLAPPING = "overlapping" # Same issue from different perspectives
|
||||
|
||||
|
||||
class Conflict(BaseModel):
|
||||
"""A detected conflict between two or more findings."""
|
||||
|
||||
id: str = Field(description="Unique identifier for this conflict")
|
||||
finding_ids: list[str] = Field(description="IDs of conflicting findings")
|
||||
nature: ConflictNature = Field(description="Type of conflict")
|
||||
description: str = Field(description="Description of the conflict")
|
||||
severity_weight: float = Field(
|
||||
ge=0.0, le=1.0, description="How significant this conflict is (0-1)"
|
||||
)
|
||||
resolution: str | None = Field(default=None, description="How the conflict was resolved")
|
||||
winning_finding_id: str | None = Field(
|
||||
default=None, description="ID of the finding that won (if applicable)"
|
||||
)
|
||||
|
||||
|
||||
class ConflictDetector:
|
||||
"""Detects conflicts between findings from different agents."""
|
||||
|
||||
# Keywords that indicate opposing recommendations
|
||||
POSITIVE_KEYWORDS: ClassVar[set[str]] = {
|
||||
"add",
|
||||
"increase",
|
||||
"enable",
|
||||
"use",
|
||||
"implement",
|
||||
"include",
|
||||
"expand",
|
||||
"more",
|
||||
"validate",
|
||||
"check",
|
||||
}
|
||||
NEGATIVE_KEYWORDS: ClassVar[set[str]] = {
|
||||
"remove",
|
||||
"decrease",
|
||||
"disable",
|
||||
"avoid",
|
||||
"simplify",
|
||||
"exclude",
|
||||
"reduce",
|
||||
"less",
|
||||
"skip",
|
||||
"delete",
|
||||
}
|
||||
|
||||
# Agent pairs that commonly have trade-offs
|
||||
TRADE_OFF_PAIRS: ClassVar[dict[tuple[str, str], str]] = {
|
||||
("security", "complexity"): "Security measures often add complexity",
|
||||
("security", "style"): "Security code may sacrifice readability",
|
||||
("complexity", "style"): "Simplification may reduce clarity",
|
||||
}
|
||||
|
||||
# Severity weights for prioritization
|
||||
SEVERITY_WEIGHTS: ClassVar[dict[Severity, float]] = {
|
||||
Severity.CRITICAL: 1.0,
|
||||
Severity.HIGH: 0.8,
|
||||
Severity.MEDIUM: 0.5,
|
||||
Severity.LOW: 0.3,
|
||||
Severity.INFO: 0.1,
|
||||
}
|
||||
|
||||
def detect_conflicts(self, findings: list[Finding]) -> list[Conflict]:
|
||||
"""Detect conflicts among a list of findings.
|
||||
|
||||
Args:
|
||||
findings: List of findings to analyze for conflicts.
|
||||
|
||||
Returns:
|
||||
List of detected conflicts.
|
||||
"""
|
||||
conflicts: list[Conflict] = []
|
||||
seen_pairs: set[tuple[str, str]] = set()
|
||||
|
||||
# Compare each pair of findings
|
||||
for i, f1 in enumerate(findings):
|
||||
for f2 in findings[i + 1 :]:
|
||||
# Skip if same agent
|
||||
if f1.agent == f2.agent:
|
||||
continue
|
||||
|
||||
# Skip if already processed this pair
|
||||
ids = sorted([f1.id, f2.id])
|
||||
pair_key = (ids[0], ids[1])
|
||||
if pair_key in seen_pairs:
|
||||
continue
|
||||
seen_pairs.add(pair_key)
|
||||
|
||||
# Check for conflicts
|
||||
conflict = self._detect_conflict_pair(f1, f2)
|
||||
if conflict:
|
||||
conflicts.append(conflict)
|
||||
|
||||
return conflicts
|
||||
|
||||
def _detect_conflict_pair(self, f1: Finding, f2: Finding) -> Conflict | None:
|
||||
"""Detect conflict between two findings."""
|
||||
# Check for location overlap first
|
||||
if f1.file != f2.file:
|
||||
return None
|
||||
|
||||
if not self._lines_overlap(f1, f2):
|
||||
return None
|
||||
|
||||
# Check for contradictory recommendations
|
||||
if self._is_contradictory(f1, f2):
|
||||
return self._create_conflict(f1, f2, ConflictNature.CONTRADICTORY)
|
||||
|
||||
# Check for known trade-off patterns
|
||||
if self._is_trade_off(f1, f2):
|
||||
return self._create_conflict(f1, f2, ConflictNature.TRADE_OFF)
|
||||
|
||||
# Check for overlapping concerns
|
||||
if self._is_overlapping(f1, f2):
|
||||
return self._create_conflict(f1, f2, ConflictNature.OVERLAPPING)
|
||||
|
||||
return None
|
||||
|
||||
def _lines_overlap(self, f1: Finding, f2: Finding) -> bool:
|
||||
# Allow some proximity (within 3 lines)
|
||||
proximity = 3
|
||||
return not (
|
||||
f1.line_end + proximity < f2.line_start or f2.line_end + proximity < f1.line_start
|
||||
)
|
||||
|
||||
def _is_contradictory(self, f1: Finding, f2: Finding) -> bool:
|
||||
"""Check if findings have contradictory recommendations."""
|
||||
if not f1.suggestion or not f2.suggestion:
|
||||
return False
|
||||
|
||||
# Check if one uses positive and other uses negative keywords
|
||||
s1_lower = f1.suggestion.lower()
|
||||
s2_lower = f2.suggestion.lower()
|
||||
|
||||
s1_positive = any(kw in s1_lower for kw in self.POSITIVE_KEYWORDS)
|
||||
s1_negative = any(kw in s1_lower for kw in self.NEGATIVE_KEYWORDS)
|
||||
s2_positive = any(kw in s2_lower for kw in self.POSITIVE_KEYWORDS)
|
||||
s2_negative = any(kw in s2_lower for kw in self.NEGATIVE_KEYWORDS)
|
||||
|
||||
# Contradictory if one is positive-only and other is negative-only
|
||||
return (s1_positive and not s1_negative and s2_negative and not s2_positive) or (
|
||||
s1_negative and not s1_positive and s2_positive and not s2_negative
|
||||
)
|
||||
|
||||
def _is_trade_off(self, f1: Finding, f2: Finding) -> bool:
|
||||
agents = sorted([f1.agent.value, f2.agent.value])
|
||||
agent_pair = (agents[0], agents[1])
|
||||
# Check both orderings since pairs may be defined in either order
|
||||
return agent_pair in self.TRADE_OFF_PAIRS or (agents[1], agents[0]) in self.TRADE_OFF_PAIRS
|
||||
|
||||
def _is_overlapping(self, f1: Finding, f2: Finding) -> bool:
|
||||
"""Check if findings cover the same issue from different angles."""
|
||||
# Consider overlapping if titles are similar
|
||||
title_words_1 = set(f1.title.lower().split())
|
||||
title_words_2 = set(f2.title.lower().split())
|
||||
|
||||
# Remove common stop words
|
||||
stop_words = {"the", "a", "an", "in", "on", "at", "for", "to", "of", "is", "are"}
|
||||
title_words_1 -= stop_words
|
||||
title_words_2 -= stop_words
|
||||
|
||||
if not title_words_1 or not title_words_2:
|
||||
return False
|
||||
|
||||
# Check for significant word overlap (Jaccard similarity > 0.3)
|
||||
intersection = len(title_words_1 & title_words_2)
|
||||
union = len(title_words_1 | title_words_2)
|
||||
|
||||
return intersection / union > 0.3
|
||||
|
||||
def _create_conflict(self, f1: Finding, f2: Finding, nature: ConflictNature) -> Conflict:
|
||||
"""Create a Conflict object for two findings."""
|
||||
# Calculate severity weight based on findings
|
||||
weight = max(
|
||||
self.SEVERITY_WEIGHTS.get(f1.severity, 0.5),
|
||||
self.SEVERITY_WEIGHTS.get(f2.severity, 0.5),
|
||||
)
|
||||
|
||||
# Generate description based on nature
|
||||
if nature == ConflictNature.CONTRADICTORY:
|
||||
description = (
|
||||
f"Contradictory recommendations: {f1.agent.value} agent suggests '{f1.suggestion}' "
|
||||
f"while {f2.agent.value} agent suggests '{f2.suggestion}'"
|
||||
)
|
||||
elif nature == ConflictNature.TRADE_OFF:
|
||||
agents = sorted([f1.agent.value, f2.agent.value])
|
||||
agent_pair = (agents[0], agents[1])
|
||||
reason = self.TRADE_OFF_PAIRS.get(agent_pair, "Competing concerns")
|
||||
description = f"Trade-off between {f1.agent.value} and {f2.agent.value}: {reason}"
|
||||
else: # OVERLAPPING
|
||||
description = (
|
||||
f"Overlapping concerns: both {f1.agent.value} and {f2.agent.value} "
|
||||
f"flagged similar issues at this location"
|
||||
)
|
||||
|
||||
return Conflict(
|
||||
id=f"conflict-{f1.id[:8]}-{f2.id[:8]}",
|
||||
finding_ids=[f1.id, f2.id],
|
||||
nature=nature,
|
||||
description=description,
|
||||
severity_weight=weight,
|
||||
)
|
||||
|
||||
def resolve_by_severity(self, conflict: Conflict, findings: list[Finding]) -> Conflict:
|
||||
"""Resolve a conflict by choosing the higher severity finding.
|
||||
|
||||
Args:
|
||||
conflict: The conflict to resolve.
|
||||
findings: List of all findings (to look up by ID).
|
||||
|
||||
Returns:
|
||||
Updated conflict with resolution.
|
||||
"""
|
||||
# Find the findings
|
||||
conflict_findings = [f for f in findings if f.id in conflict.finding_ids]
|
||||
|
||||
if not conflict_findings:
|
||||
return conflict
|
||||
|
||||
# Sort by severity weight, then confidence
|
||||
sorted_findings = sorted(
|
||||
conflict_findings,
|
||||
key=lambda f: (
|
||||
-self.SEVERITY_WEIGHTS.get(f.severity, 0.5),
|
||||
-f.confidence,
|
||||
),
|
||||
)
|
||||
|
||||
winner = sorted_findings[0]
|
||||
|
||||
return Conflict(
|
||||
id=conflict.id,
|
||||
finding_ids=conflict.finding_ids,
|
||||
nature=conflict.nature,
|
||||
description=conflict.description,
|
||||
severity_weight=conflict.severity_weight,
|
||||
resolution=f"Resolved by severity: {winner.agent.value} finding takes precedence",
|
||||
winning_finding_id=winner.id,
|
||||
)
|
||||
286
src/arbiter/deliberation/coordinator.py
Normal file
286
src/arbiter/deliberation/coordinator.py
Normal file
@@ -0,0 +1,286 @@
|
||||
"""Coordinator for deliberation process."""
|
||||
|
||||
from datetime import UTC, datetime
|
||||
from enum import StrEnum
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from arbiter.deliberation.conflicts import Conflict, ConflictDetector
|
||||
from arbiter.deliberation.merger import FindingMerger, MergedFindings
|
||||
from arbiter.deliberation.synthesis import ConflictSynthesizer, Resolution
|
||||
from arbiter.llm.client import LLMClient
|
||||
from arbiter.models import Finding, ReviewResult, Severity, Verdict
|
||||
|
||||
|
||||
class StepType(StrEnum):
|
||||
"""Types of deliberation steps."""
|
||||
|
||||
MERGE = "merge"
|
||||
CONFLICT_DETECTION = "conflict_detection"
|
||||
SYNTHESIS = "synthesis"
|
||||
VERDICT = "verdict"
|
||||
|
||||
|
||||
class DeliberationStep(BaseModel):
|
||||
"""A single step in the deliberation process."""
|
||||
|
||||
step_type: StepType = Field(description="Type of deliberation step")
|
||||
timestamp: datetime = Field(default_factory=lambda: datetime.now(UTC))
|
||||
description: str = Field(description="What happened in this step")
|
||||
details: dict[str, Any] | None = Field(default=None, description="Additional details")
|
||||
|
||||
|
||||
class DeliberationResult(BaseModel):
|
||||
"""Complete result of the deliberation process."""
|
||||
|
||||
findings: list[Finding] = Field(default_factory=list, description="All unique findings")
|
||||
merged: MergedFindings | None = Field(default=None, description="Merged findings structure")
|
||||
conflicts: list[Conflict] = Field(default_factory=list, description="Detected conflicts")
|
||||
resolutions: list[Resolution] = Field(default_factory=list, description="Conflict resolutions")
|
||||
steps: list[DeliberationStep] = Field(default_factory=list, description="Deliberation log")
|
||||
verdict: Verdict = Field(default=Verdict.COMMENT, description="Final verdict")
|
||||
verdict_confidence: float = Field(
|
||||
ge=0.0, le=1.0, default=0.5, description="Confidence in verdict"
|
||||
)
|
||||
verdict_reasoning: str = Field(default="", description="Explanation for verdict")
|
||||
total_findings: int = Field(default=0, description="Total number of findings")
|
||||
critical_count: int = Field(default=0, description="Number of critical findings")
|
||||
high_count: int = Field(default=0, description="Number of high severity findings")
|
||||
tokens_used: int = Field(default=0, description="Tokens used for synthesis")
|
||||
cost_usd: float = Field(default=0.0, description="Cost of synthesis")
|
||||
|
||||
|
||||
class VerdictConfig(BaseModel):
|
||||
"""Configuration for verdict determination."""
|
||||
|
||||
request_changes_critical_threshold: int = Field(
|
||||
default=1, description="Number of critical findings to request changes"
|
||||
)
|
||||
request_changes_high_threshold: int = Field(
|
||||
default=3, description="Number of high findings to request changes"
|
||||
)
|
||||
comment_high_threshold: int = Field(
|
||||
default=1, description="Number of high findings to add a comment"
|
||||
)
|
||||
comment_medium_threshold: int = Field(
|
||||
default=5, description="Number of medium findings to add a comment"
|
||||
)
|
||||
|
||||
|
||||
class Coordinator:
|
||||
"""Orchestrates the deliberation process."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
llm_client: LLMClient | None = None,
|
||||
synthesis_model: str = "gpt-4o-mini",
|
||||
verdict_config: VerdictConfig | None = None,
|
||||
) -> None:
|
||||
self.llm_client = llm_client
|
||||
self.synthesis_model = synthesis_model
|
||||
self.verdict_config = verdict_config or VerdictConfig()
|
||||
self.merger = FindingMerger()
|
||||
self.conflict_detector = ConflictDetector()
|
||||
self.synthesizer: ConflictSynthesizer | None = None
|
||||
if llm_client:
|
||||
self.synthesizer = ConflictSynthesizer(llm_client, synthesis_model)
|
||||
|
||||
async def deliberate(
|
||||
self,
|
||||
agent_results: list[ReviewResult],
|
||||
static_findings: list[Finding] | None = None,
|
||||
) -> DeliberationResult:
|
||||
"""Run the full deliberation process.
|
||||
|
||||
Args:
|
||||
agent_results: Results from agent reviews.
|
||||
static_findings: Findings from static analysis.
|
||||
|
||||
Returns:
|
||||
DeliberationResult with findings, conflicts, and verdict.
|
||||
"""
|
||||
result = DeliberationResult()
|
||||
|
||||
all_agent_findings: list[Finding] = []
|
||||
for agent_result in agent_results:
|
||||
all_agent_findings.extend(agent_result.findings)
|
||||
|
||||
merged = self.merger.merge(all_agent_findings, static_findings)
|
||||
result.merged = merged
|
||||
result.findings = merged.unique_findings
|
||||
result.total_findings = len(merged.unique_findings)
|
||||
|
||||
result.steps.append(
|
||||
DeliberationStep(
|
||||
step_type=StepType.MERGE,
|
||||
description=f"Merged {len(all_agent_findings)} agent findings with {len(static_findings or [])} static findings",
|
||||
details={
|
||||
"groups": len(merged.groups),
|
||||
"unique": len(merged.unique_findings),
|
||||
"duplicates_removed": merged.duplicates_removed,
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
conflicts = self.conflict_detector.detect_conflicts(merged.unique_findings)
|
||||
result.conflicts = conflicts
|
||||
|
||||
result.steps.append(
|
||||
DeliberationStep(
|
||||
step_type=StepType.CONFLICT_DETECTION,
|
||||
description=f"Detected {len(conflicts)} conflicts among findings",
|
||||
details={
|
||||
"by_nature": self._count_by_nature(conflicts),
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
if conflicts and self.synthesizer:
|
||||
resolutions, tokens, cost = await self._synthesize_conflicts(
|
||||
conflicts, merged.unique_findings
|
||||
)
|
||||
result.resolutions = resolutions
|
||||
result.tokens_used = tokens
|
||||
result.cost_usd = cost
|
||||
|
||||
result.steps.append(
|
||||
DeliberationStep(
|
||||
step_type=StepType.SYNTHESIS,
|
||||
description=f"Synthesized {len(resolutions)} conflict resolutions",
|
||||
details={
|
||||
"tokens_used": tokens,
|
||||
"cost_usd": cost,
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
self._count_severities(result)
|
||||
verdict, confidence, reasoning = self._determine_verdict(result)
|
||||
result.verdict = verdict
|
||||
result.verdict_confidence = confidence
|
||||
result.verdict_reasoning = reasoning
|
||||
|
||||
result.steps.append(
|
||||
DeliberationStep(
|
||||
step_type=StepType.VERDICT,
|
||||
description=f"Verdict: {verdict.value} (confidence: {confidence:.2f})",
|
||||
details={
|
||||
"critical_count": result.critical_count,
|
||||
"high_count": result.high_count,
|
||||
"reasoning": reasoning,
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
def _count_by_nature(self, conflicts: list[Conflict]) -> dict[str, int]:
|
||||
counts: dict[str, int] = {}
|
||||
for conflict in conflicts:
|
||||
nature = conflict.nature.value
|
||||
counts[nature] = counts.get(nature, 0) + 1
|
||||
return counts
|
||||
|
||||
async def _synthesize_conflicts(
|
||||
self, conflicts: list[Conflict], findings: list[Finding]
|
||||
) -> tuple[list[Resolution], int, float]:
|
||||
"""Synthesize resolutions for conflicts that need it."""
|
||||
resolutions: list[Resolution] = []
|
||||
total_tokens = 0
|
||||
total_cost = 0.0
|
||||
|
||||
if not self.synthesizer:
|
||||
return resolutions, total_tokens, total_cost
|
||||
|
||||
for conflict in conflicts:
|
||||
if self.synthesizer.should_synthesize(conflict):
|
||||
resolution = await self.synthesizer.synthesize(conflict, findings)
|
||||
resolutions.append(resolution)
|
||||
else:
|
||||
# Use algorithmic resolution
|
||||
resolved = self.conflict_detector.resolve_by_severity(conflict, findings)
|
||||
resolutions.append(
|
||||
Resolution(
|
||||
conflict_id=conflict.id,
|
||||
decision="prefer_first"
|
||||
if resolved.winning_finding_id == conflict.finding_ids[0]
|
||||
else "prefer_second",
|
||||
reasoning=resolved.resolution or "Resolved by severity",
|
||||
confidence=0.8,
|
||||
)
|
||||
)
|
||||
|
||||
return resolutions, total_tokens, total_cost
|
||||
|
||||
def _count_severities(self, result: DeliberationResult) -> None:
|
||||
for finding in result.findings:
|
||||
if finding.severity == Severity.CRITICAL:
|
||||
result.critical_count += 1
|
||||
elif finding.severity == Severity.HIGH:
|
||||
result.high_count += 1
|
||||
|
||||
def _determine_verdict(self, result: DeliberationResult) -> tuple[Verdict, float, str]:
|
||||
"""Determine the final verdict based on findings and conflicts.
|
||||
|
||||
Returns:
|
||||
Tuple of (verdict, confidence, reasoning).
|
||||
"""
|
||||
config = self.verdict_config
|
||||
|
||||
# Critical findings always trigger request_changes
|
||||
if result.critical_count >= config.request_changes_critical_threshold:
|
||||
return (
|
||||
Verdict.REQUEST_CHANGES,
|
||||
0.95,
|
||||
f"Found {result.critical_count} critical issue(s) requiring immediate attention",
|
||||
)
|
||||
|
||||
# High severity threshold
|
||||
if result.high_count >= config.request_changes_high_threshold:
|
||||
return (
|
||||
Verdict.REQUEST_CHANGES,
|
||||
0.85,
|
||||
f"Found {result.high_count} high severity issue(s) that should be addressed",
|
||||
)
|
||||
|
||||
# Count medium findings
|
||||
medium_count = sum(1 for f in result.findings if f.severity == Severity.MEDIUM)
|
||||
|
||||
# Comment thresholds
|
||||
if result.high_count >= config.comment_high_threshold:
|
||||
return (
|
||||
Verdict.COMMENT,
|
||||
0.75,
|
||||
f"Found {result.high_count} high severity issue(s) worth discussing",
|
||||
)
|
||||
|
||||
if medium_count >= config.comment_medium_threshold:
|
||||
return (
|
||||
Verdict.COMMENT,
|
||||
0.7,
|
||||
f"Found {medium_count} medium severity issue(s) worth discussing",
|
||||
)
|
||||
|
||||
# Unresolved conflicts should trigger comment
|
||||
unresolved_conflicts = len(result.conflicts) - len(result.resolutions)
|
||||
if unresolved_conflicts > 0:
|
||||
return (
|
||||
Verdict.COMMENT,
|
||||
0.65,
|
||||
f"Found {unresolved_conflicts} unresolved conflict(s) between agents",
|
||||
)
|
||||
|
||||
# Default: approve
|
||||
if result.total_findings == 0:
|
||||
return (
|
||||
Verdict.APPROVE,
|
||||
0.95,
|
||||
"No issues found",
|
||||
)
|
||||
|
||||
return (
|
||||
Verdict.APPROVE,
|
||||
0.8,
|
||||
f"Found {result.total_findings} minor issue(s), none blocking",
|
||||
)
|
||||
207
src/arbiter/deliberation/merger.py
Normal file
207
src/arbiter/deliberation/merger.py
Normal file
@@ -0,0 +1,207 @@
|
||||
"""Finding merger for grouping and deduplicating findings."""
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from arbiter.models import Finding
|
||||
|
||||
|
||||
class FindingGroup(BaseModel):
|
||||
"""A group of related findings at the same location."""
|
||||
|
||||
file: str = Field(description="File path")
|
||||
line_start: int = Field(description="Starting line of the group")
|
||||
line_end: int = Field(description="Ending line of the group")
|
||||
findings: list[Finding] = Field(default_factory=list, description="Findings in this group")
|
||||
|
||||
@property
|
||||
def primary_finding(self) -> Finding | None:
|
||||
if not self.findings:
|
||||
return None
|
||||
# Sort by severity (critical > high > medium > low > info)
|
||||
severity_order = {"critical": 0, "high": 1, "medium": 2, "low": 3, "info": 4}
|
||||
return min(
|
||||
self.findings, key=lambda f: (severity_order.get(f.severity.value, 5), -f.confidence)
|
||||
)
|
||||
|
||||
@property
|
||||
def agents(self) -> list[str]:
|
||||
return list({f.agent for f in self.findings})
|
||||
|
||||
|
||||
class MergedFindings(BaseModel):
|
||||
"""Result of merging findings from multiple sources."""
|
||||
|
||||
groups: list[FindingGroup] = Field(default_factory=list, description="Grouped findings")
|
||||
unique_findings: list[Finding] = Field(
|
||||
default_factory=list, description="Deduplicated list of all findings"
|
||||
)
|
||||
duplicates_removed: int = Field(default=0, description="Number of duplicates removed")
|
||||
|
||||
def get_groups_for_file(self, file: str) -> list[FindingGroup]:
|
||||
return [g for g in self.groups if g.file == file]
|
||||
|
||||
def get_all_findings(self) -> list[Finding]:
|
||||
return self.unique_findings
|
||||
|
||||
|
||||
class FindingMerger:
|
||||
"""Merges and deduplicates findings from static analysis and agents."""
|
||||
|
||||
def __init__(self, proximity_threshold: int = 5) -> None:
|
||||
self.proximity_threshold = proximity_threshold
|
||||
|
||||
def merge(
|
||||
self,
|
||||
agent_findings: list[Finding],
|
||||
static_findings: list[Finding] | None = None,
|
||||
) -> MergedFindings:
|
||||
"""Merge findings from agents and static analysis.
|
||||
|
||||
Args:
|
||||
agent_findings: Findings from AI agents.
|
||||
static_findings: Findings converted from static analysis tools.
|
||||
|
||||
Returns:
|
||||
MergedFindings with grouped and deduplicated findings.
|
||||
"""
|
||||
all_findings = list(agent_findings)
|
||||
if static_findings:
|
||||
all_findings.extend(static_findings)
|
||||
|
||||
if not all_findings:
|
||||
return MergedFindings()
|
||||
|
||||
# Deduplicate first
|
||||
unique, duplicates_count = self._deduplicate(all_findings)
|
||||
|
||||
# Group by proximity
|
||||
groups = self._group_by_location(unique)
|
||||
|
||||
return MergedFindings(
|
||||
groups=groups,
|
||||
unique_findings=unique,
|
||||
duplicates_removed=duplicates_count,
|
||||
)
|
||||
|
||||
def _deduplicate(self, findings: list[Finding]) -> tuple[list[Finding], int]:
|
||||
"""Remove duplicate findings.
|
||||
|
||||
Two findings are considered duplicates if they have:
|
||||
- Same file
|
||||
- Same or overlapping lines
|
||||
- Similar title or message (normalized)
|
||||
"""
|
||||
if not findings:
|
||||
return [], 0
|
||||
|
||||
unique: list[Finding] = []
|
||||
duplicates = 0
|
||||
|
||||
for finding in findings:
|
||||
is_duplicate = False
|
||||
|
||||
for existing in unique:
|
||||
if self._is_duplicate(finding, existing):
|
||||
is_duplicate = True
|
||||
# Keep the one with higher confidence or more detail
|
||||
if finding.confidence > existing.confidence or (
|
||||
finding.confidence == existing.confidence
|
||||
and len(finding.description) > len(existing.description)
|
||||
):
|
||||
unique.remove(existing)
|
||||
unique.append(finding)
|
||||
duplicates += 1
|
||||
break
|
||||
|
||||
if not is_duplicate:
|
||||
unique.append(finding)
|
||||
|
||||
return unique, duplicates
|
||||
|
||||
def _is_duplicate(self, a: Finding, b: Finding) -> bool:
|
||||
"""Check if two findings are duplicates."""
|
||||
# Must be in same file
|
||||
if a.file != b.file:
|
||||
return False
|
||||
|
||||
# Check line overlap
|
||||
if not self._lines_overlap(a.line_start, a.line_end, b.line_start, b.line_end):
|
||||
return False
|
||||
|
||||
# Check content similarity
|
||||
return self._content_similar(a, b)
|
||||
|
||||
def _lines_overlap(self, a_start: int, a_end: int, b_start: int, b_end: int) -> bool:
|
||||
return not (a_end < b_start or b_end < a_start)
|
||||
|
||||
def _content_similar(self, a: Finding, b: Finding) -> bool:
|
||||
"""Check if two findings have similar content."""
|
||||
# Normalize titles for comparison
|
||||
title_a = self._normalize(a.title)
|
||||
title_b = self._normalize(b.title)
|
||||
|
||||
# Same title is a strong indicator
|
||||
if title_a == title_b:
|
||||
return True
|
||||
|
||||
# Check if one title contains the other
|
||||
if title_a in title_b or title_b in title_a:
|
||||
return True
|
||||
|
||||
# Check for same static analysis code (e.g., both are "S101")
|
||||
return bool(
|
||||
a.static_analysis_context
|
||||
and b.static_analysis_context
|
||||
and a.static_analysis_context.get("code") == b.static_analysis_context.get("code")
|
||||
)
|
||||
|
||||
def _normalize(self, text: str) -> str:
|
||||
# Remove tool prefixes like "[ruff]", "[mypy]"
|
||||
import re
|
||||
|
||||
text = re.sub(r"\[[\w-]+\]\s*", "", text)
|
||||
# Lowercase and strip
|
||||
return text.lower().strip()
|
||||
|
||||
def _group_by_location(self, findings: list[Finding]) -> list[FindingGroup]:
|
||||
"""Group findings by file and proximity."""
|
||||
if not findings:
|
||||
return []
|
||||
|
||||
# Sort by file, then line
|
||||
sorted_findings = sorted(findings, key=lambda f: (f.file, f.line_start))
|
||||
|
||||
groups: list[FindingGroup] = []
|
||||
current_group: FindingGroup | None = None
|
||||
|
||||
for finding in sorted_findings:
|
||||
if current_group is None:
|
||||
# Start new group
|
||||
current_group = FindingGroup(
|
||||
file=finding.file,
|
||||
line_start=finding.line_start,
|
||||
line_end=finding.line_end,
|
||||
findings=[finding],
|
||||
)
|
||||
elif (
|
||||
current_group.file == finding.file
|
||||
and finding.line_start <= current_group.line_end + self.proximity_threshold
|
||||
):
|
||||
# Extend current group
|
||||
current_group.findings.append(finding)
|
||||
current_group.line_end = max(current_group.line_end, finding.line_end)
|
||||
else:
|
||||
# Save current group and start new one
|
||||
groups.append(current_group)
|
||||
current_group = FindingGroup(
|
||||
file=finding.file,
|
||||
line_start=finding.line_start,
|
||||
line_end=finding.line_end,
|
||||
findings=[finding],
|
||||
)
|
||||
|
||||
# Don't forget last group
|
||||
if current_group is not None:
|
||||
groups.append(current_group)
|
||||
|
||||
return groups
|
||||
213
src/arbiter/deliberation/synthesis.py
Normal file
213
src/arbiter/deliberation/synthesis.py
Normal file
@@ -0,0 +1,213 @@
|
||||
"""LLM-based synthesis for ambiguous conflicts."""
|
||||
|
||||
import json
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from arbiter.deliberation.conflicts import Conflict, ConflictNature
|
||||
from arbiter.llm.client import LLMClient
|
||||
from arbiter.models import Finding
|
||||
|
||||
|
||||
class Resolution(BaseModel):
|
||||
"""Resolution for a conflict produced by LLM synthesis."""
|
||||
|
||||
conflict_id: str = Field(description="ID of the resolved conflict")
|
||||
decision: str = Field(
|
||||
description="The decision made (keep_both, prefer_first, prefer_second, merge)"
|
||||
)
|
||||
reasoning: str = Field(description="Explanation of why this decision was made")
|
||||
merged_suggestion: str | None = Field(
|
||||
default=None, description="Combined suggestion if decision is merge"
|
||||
)
|
||||
confidence: float = Field(ge=0.0, le=1.0, description="Confidence in this resolution")
|
||||
|
||||
|
||||
SYNTHESIS_PROMPT = """You are a code review coordinator synthesizing findings from multiple specialized agents.
|
||||
|
||||
Two agents have provided conflicting feedback on the same code location. Your job is to:
|
||||
1. Understand both perspectives
|
||||
2. Determine the best resolution
|
||||
3. Provide clear reasoning
|
||||
|
||||
## Conflict Information
|
||||
|
||||
**Nature:** {nature}
|
||||
**Description:** {description}
|
||||
|
||||
## Finding 1 ({agent1})
|
||||
- **Severity:** {severity1}
|
||||
- **Confidence:** {confidence1}
|
||||
- **Title:** {title1}
|
||||
- **Description:** {desc1}
|
||||
- **Reasoning:** {reasoning1}
|
||||
- **Suggestion:** {suggestion1}
|
||||
|
||||
## Finding 2 ({agent2})
|
||||
- **Severity:** {severity2}
|
||||
- **Confidence:** {confidence2}
|
||||
- **Title:** {title2}
|
||||
- **Description:** {desc2}
|
||||
- **Reasoning:** {reasoning2}
|
||||
- **Suggestion:** {suggestion2}
|
||||
|
||||
## Instructions
|
||||
|
||||
Analyze both findings and determine the best resolution. Consider:
|
||||
- Severity and confidence levels
|
||||
- Whether the concerns can be addressed together
|
||||
- The practical impact on code quality
|
||||
- Security concerns always take priority over style/complexity
|
||||
|
||||
Respond with a JSON object:
|
||||
```json
|
||||
{{
|
||||
"decision": "keep_both" | "prefer_first" | "prefer_second" | "merge",
|
||||
"reasoning": "Clear explanation of your decision",
|
||||
"merged_suggestion": "Combined suggestion if decision is merge, otherwise null",
|
||||
"confidence": 0.0-1.0
|
||||
}}
|
||||
```
|
||||
"""
|
||||
|
||||
|
||||
class ConflictSynthesizer:
|
||||
"""Uses LLM to synthesize resolutions for ambiguous conflicts."""
|
||||
|
||||
def __init__(self, llm_client: LLMClient, model: str = "gpt-4o-mini") -> None:
|
||||
self.llm_client = llm_client
|
||||
self.model = model
|
||||
|
||||
async def synthesize(self, conflict: Conflict, findings: list[Finding]) -> Resolution:
|
||||
"""Synthesize a resolution for a conflict using LLM.
|
||||
|
||||
Args:
|
||||
conflict: The conflict to resolve.
|
||||
findings: All findings (to look up by ID).
|
||||
|
||||
Returns:
|
||||
Resolution with decision and reasoning.
|
||||
"""
|
||||
# Find the two conflicting findings
|
||||
f1, f2 = self._get_conflict_findings(conflict, findings)
|
||||
|
||||
if f1 is None or f2 is None:
|
||||
# Can't synthesize without both findings
|
||||
return Resolution(
|
||||
conflict_id=conflict.id,
|
||||
decision="keep_both",
|
||||
reasoning="Could not find both findings for synthesis",
|
||||
confidence=0.5,
|
||||
)
|
||||
|
||||
# Build the prompt
|
||||
prompt = self._build_prompt(conflict, f1, f2)
|
||||
|
||||
# Call LLM
|
||||
response = await self.llm_client.complete(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
model=self.model,
|
||||
)
|
||||
|
||||
# Parse response
|
||||
return self._parse_response(conflict.id, response.content, f1, f2)
|
||||
|
||||
def _get_conflict_findings(
|
||||
self, conflict: Conflict, findings: list[Finding]
|
||||
) -> tuple[Finding | None, Finding | None]:
|
||||
"""Get the two findings involved in a conflict."""
|
||||
f1 = None
|
||||
f2 = None
|
||||
|
||||
for finding in findings:
|
||||
if finding.id == conflict.finding_ids[0]:
|
||||
f1 = finding
|
||||
elif finding.id == conflict.finding_ids[1]:
|
||||
f2 = finding
|
||||
|
||||
return f1, f2
|
||||
|
||||
def _build_prompt(self, conflict: Conflict, f1: Finding, f2: Finding) -> str:
|
||||
"""Build the synthesis prompt."""
|
||||
return SYNTHESIS_PROMPT.format(
|
||||
nature=conflict.nature.value,
|
||||
description=conflict.description,
|
||||
agent1=f1.agent.value,
|
||||
severity1=f1.severity.value,
|
||||
confidence1=f1.confidence,
|
||||
title1=f1.title,
|
||||
desc1=f1.description,
|
||||
reasoning1=f1.reasoning,
|
||||
suggestion1=f1.suggestion or "None provided",
|
||||
agent2=f2.agent.value,
|
||||
severity2=f2.severity.value,
|
||||
confidence2=f2.confidence,
|
||||
title2=f2.title,
|
||||
desc2=f2.description,
|
||||
reasoning2=f2.reasoning,
|
||||
suggestion2=f2.suggestion or "None provided",
|
||||
)
|
||||
|
||||
def _parse_response(
|
||||
self, conflict_id: str, content: str, f1: Finding, f2: Finding
|
||||
) -> Resolution:
|
||||
"""Parse LLM response into a Resolution."""
|
||||
# Try to extract JSON from the response
|
||||
try:
|
||||
# Look for JSON block
|
||||
if "```json" in content:
|
||||
json_str = content.split("```json")[1].split("```")[0].strip()
|
||||
elif "```" in content:
|
||||
json_str = content.split("```")[1].split("```")[0].strip()
|
||||
else:
|
||||
json_str = content.strip()
|
||||
|
||||
data = json.loads(json_str)
|
||||
|
||||
return Resolution(
|
||||
conflict_id=conflict_id,
|
||||
decision=data.get("decision", "keep_both"),
|
||||
reasoning=data.get("reasoning", "No reasoning provided"),
|
||||
merged_suggestion=data.get("merged_suggestion"),
|
||||
confidence=float(data.get("confidence", 0.7)),
|
||||
)
|
||||
except (json.JSONDecodeError, IndexError, ValueError):
|
||||
# Fallback: use heuristics
|
||||
return self._fallback_resolution(conflict_id, f1, f2)
|
||||
|
||||
def _fallback_resolution(self, conflict_id: str, f1: Finding, f2: Finding) -> Resolution:
|
||||
"""Provide a fallback resolution when LLM parsing fails."""
|
||||
# Prefer higher severity
|
||||
severity_order = {"critical": 0, "high": 1, "medium": 2, "low": 3, "info": 4}
|
||||
s1 = severity_order.get(f1.severity.value, 5)
|
||||
s2 = severity_order.get(f2.severity.value, 5)
|
||||
|
||||
if s1 < s2:
|
||||
return Resolution(
|
||||
conflict_id=conflict_id,
|
||||
decision="prefer_first",
|
||||
reasoning=f"Fallback: {f1.agent.value} finding has higher severity ({f1.severity.value})",
|
||||
confidence=0.6,
|
||||
)
|
||||
elif s2 < s1:
|
||||
return Resolution(
|
||||
conflict_id=conflict_id,
|
||||
decision="prefer_second",
|
||||
reasoning=f"Fallback: {f2.agent.value} finding has higher severity ({f2.severity.value})",
|
||||
confidence=0.6,
|
||||
)
|
||||
else:
|
||||
return Resolution(
|
||||
conflict_id=conflict_id,
|
||||
decision="keep_both",
|
||||
reasoning="Fallback: Both findings have equal severity, keeping both",
|
||||
confidence=0.5,
|
||||
)
|
||||
|
||||
def should_synthesize(self, conflict: Conflict) -> bool:
|
||||
# Always synthesize contradictory conflicts
|
||||
if conflict.nature == ConflictNature.CONTRADICTORY:
|
||||
return True
|
||||
|
||||
# Synthesize high-severity trade-offs, don't synthesize overlapping
|
||||
return conflict.nature == ConflictNature.TRADE_OFF and conflict.severity_weight >= 0.7
|
||||
42
tests/fixtures/conflict-contradictory.diff
vendored
Normal file
42
tests/fixtures/conflict-contradictory.diff
vendored
Normal file
@@ -0,0 +1,42 @@
|
||||
diff --git a/src/config.py b/src/config.py
|
||||
index 1234567..abcdefg 100644
|
||||
--- a/src/config.py
|
||||
+++ b/src/config.py
|
||||
@@ -1,5 +1,35 @@
|
||||
"""Configuration module."""
|
||||
|
||||
+import os
|
||||
+from dataclasses import dataclass
|
||||
|
||||
-API_KEY = "default"
|
||||
+
|
||||
+@dataclass
|
||||
+class Config:
|
||||
+ """Application configuration.
|
||||
+
|
||||
+ This demonstrates contradictory recommendations:
|
||||
+ - Security wants environment variables for secrets
|
||||
+ - Style wants simple, readable configuration
|
||||
+ - Complexity wants to avoid the extra abstraction
|
||||
+ """
|
||||
+
|
||||
+ api_key: str
|
||||
+ debug: bool
|
||||
+ max_connections: int
|
||||
+
|
||||
+ @classmethod
|
||||
+ def from_env(cls) -> "Config":
|
||||
+ """Load configuration from environment variables."""
|
||||
+ return cls(
|
||||
+ api_key=os.environ.get("API_KEY", ""),
|
||||
+ debug=os.environ.get("DEBUG", "false").lower() == "true",
|
||||
+ max_connections=int(os.environ.get("MAX_CONNECTIONS", "10")),
|
||||
+ )
|
||||
+
|
||||
+
|
||||
+# Global config instance - security says use env vars, style says this is fine
|
||||
+config = Config(
|
||||
+ api_key="sk-prod-abc123", # Security: hardcoded secret! Style: it's readable
|
||||
+ debug=True,
|
||||
+ max_connections=100,
|
||||
+)
|
||||
37
tests/fixtures/conflict-overlapping.diff
vendored
Normal file
37
tests/fixtures/conflict-overlapping.diff
vendored
Normal file
@@ -0,0 +1,37 @@
|
||||
diff --git a/src/handler.py b/src/handler.py
|
||||
index 1234567..abcdefg 100644
|
||||
--- a/src/handler.py
|
||||
+++ b/src/handler.py
|
||||
@@ -1,8 +1,30 @@
|
||||
"""Request handler module."""
|
||||
|
||||
+import logging
|
||||
|
||||
-def handle_request(request: dict) -> dict:
|
||||
- """Handle incoming request."""
|
||||
- return {"status": "ok"}
|
||||
+logger = logging.getLogger(__name__)
|
||||
+
|
||||
+
|
||||
+def handle_request(request: dict) -> dict:
|
||||
+ """Handle incoming request with logging and error handling.
|
||||
+
|
||||
+ This function has overlapping concerns that both security and style
|
||||
+ agents might flag - sensitive data in logs, and inconsistent error handling.
|
||||
+ """
|
||||
+ # Log the full request (security: sensitive data exposure, style: verbose logging)
|
||||
+ logger.debug(f"Received request: {request}")
|
||||
+
|
||||
+ user_id = request.get("user_id")
|
||||
+ action = request.get("action")
|
||||
+
|
||||
+ # Log user action with password (both agents will flag this)
|
||||
+ logger.info(f"User {user_id} performing {action}, auth: {request.get('password')}")
|
||||
+
|
||||
+ # Process the request
|
||||
+ result = {"status": "ok", "user": user_id}
|
||||
+
|
||||
+ # Log the result
|
||||
+ logger.debug(f"Returning result: {result}")
|
||||
+
|
||||
+ return result
|
||||
57
tests/fixtures/conflict-security-complexity.diff
vendored
Normal file
57
tests/fixtures/conflict-security-complexity.diff
vendored
Normal file
@@ -0,0 +1,57 @@
|
||||
diff --git a/src/validator.py b/src/validator.py
|
||||
index 1234567..abcdefg 100644
|
||||
--- a/src/validator.py
|
||||
+++ b/src/validator.py
|
||||
@@ -1,10 +1,45 @@
|
||||
"""Input validation module."""
|
||||
|
||||
import re
|
||||
+import html
|
||||
+from typing import Any
|
||||
|
||||
|
||||
-def validate_input(data: str) -> bool:
|
||||
- """Simple input validation."""
|
||||
- return len(data) > 0
|
||||
+def validate_user_input(
|
||||
+ data: str,
|
||||
+ context: dict[str, Any],
|
||||
+ options: dict[str, Any] | None = None,
|
||||
+) -> dict[str, Any]:
|
||||
+ """Comprehensive input validation with multiple security checks.
|
||||
+
|
||||
+ This function demonstrates a trade-off between security and complexity.
|
||||
+ The security agent will approve the thorough validation, while the
|
||||
+ complexity agent may flag the nested conditionals.
|
||||
+ """
|
||||
+ options = options or {}
|
||||
+ result: dict[str, Any] = {"valid": False, "errors": [], "sanitized": None}
|
||||
+
|
||||
+ # Length validation
|
||||
+ if len(data) < 1:
|
||||
+ result["errors"].append("Input cannot be empty")
|
||||
+ return result
|
||||
+
|
||||
+ if len(data) > options.get("max_length", 10000):
|
||||
+ result["errors"].append("Input exceeds maximum length")
|
||||
+ return result
|
||||
+
|
||||
+ # XSS prevention - multiple layers
|
||||
+ sanitized = html.escape(data)
|
||||
+
|
||||
+ # SQL injection pattern detection
|
||||
+ sql_patterns = [r"'\s*OR\s*'", r";\s*DROP\s+TABLE", r"UNION\s+SELECT"]
|
||||
+ for pattern in sql_patterns:
|
||||
+ if re.search(pattern, data, re.IGNORECASE):
|
||||
+ result["errors"].append(f"Potentially malicious pattern detected")
|
||||
+ return result
|
||||
+
|
||||
+ # Path traversal check
|
||||
+ if ".." in data or data.startswith("/"):
|
||||
+ if not options.get("allow_paths", False):
|
||||
+ result["errors"].append("Path characters not allowed")
|
||||
+ return result
|
||||
+
|
||||
+ result["valid"] = True
|
||||
+ result["sanitized"] = sanitized
|
||||
+ return result
|
||||
743
tests/test_deliberation.py
Normal file
743
tests/test_deliberation.py
Normal file
@@ -0,0 +1,743 @@
|
||||
"""Tests for deliberation module."""
|
||||
|
||||
import pytest
|
||||
|
||||
from arbiter.deliberation.conflicts import Conflict, ConflictDetector, ConflictNature
|
||||
from arbiter.deliberation.coordinator import Coordinator, StepType
|
||||
from arbiter.deliberation.merger import FindingGroup, FindingMerger
|
||||
from arbiter.deliberation.synthesis import ConflictSynthesizer
|
||||
from arbiter.models import AgentName, Finding, ReviewResult, Severity, Verdict
|
||||
|
||||
from .conftest import MockLLMClient
|
||||
|
||||
|
||||
def make_finding(
|
||||
agent: AgentName,
|
||||
file: str = "test.py",
|
||||
line_start: int = 10,
|
||||
line_end: int = 15,
|
||||
severity: Severity = Severity.MEDIUM,
|
||||
confidence: float = 0.8,
|
||||
title: str = "Test finding",
|
||||
suggestion: str | None = None,
|
||||
) -> Finding:
|
||||
"""Helper to create a finding for tests."""
|
||||
return Finding(
|
||||
id=f"{agent.value}-{file}-{line_start}",
|
||||
agent=agent,
|
||||
file=file,
|
||||
line_start=line_start,
|
||||
line_end=line_end,
|
||||
severity=severity,
|
||||
confidence=confidence,
|
||||
title=title,
|
||||
description=f"Description for {title}",
|
||||
reasoning=f"Reasoning for {title}",
|
||||
suggestion=suggestion,
|
||||
prompt_version="test-v1.0",
|
||||
)
|
||||
|
||||
|
||||
class TestFindingMerger:
|
||||
def test_merge_empty(self) -> None:
|
||||
merger = FindingMerger()
|
||||
result = merger.merge([], None)
|
||||
assert result.unique_findings == []
|
||||
assert result.groups == []
|
||||
assert result.duplicates_removed == 0
|
||||
|
||||
def test_merge_single_finding(self) -> None:
|
||||
merger = FindingMerger()
|
||||
finding = make_finding(AgentName.SECURITY)
|
||||
result = merger.merge([finding], None)
|
||||
|
||||
assert len(result.unique_findings) == 1
|
||||
assert len(result.groups) == 1
|
||||
assert result.groups[0].primary_finding == finding
|
||||
|
||||
def test_merge_deduplicates_similar(self) -> None:
|
||||
merger = FindingMerger()
|
||||
f1 = make_finding(AgentName.SECURITY, title="SQL Injection")
|
||||
f2 = make_finding(AgentName.STYLE, title="SQL Injection vulnerability")
|
||||
|
||||
result = merger.merge([f1, f2], None)
|
||||
|
||||
assert result.duplicates_removed == 1
|
||||
assert len(result.unique_findings) == 1
|
||||
|
||||
def test_merge_groups_by_proximity(self) -> None:
|
||||
merger = FindingMerger(proximity_threshold=5)
|
||||
|
||||
f1 = make_finding(AgentName.SECURITY, line_start=10, line_end=12)
|
||||
f2 = make_finding(AgentName.STYLE, line_start=14, line_end=16)
|
||||
f3 = make_finding(AgentName.COMPLEXITY, line_start=50, line_end=55)
|
||||
|
||||
result = merger.merge([f1, f2, f3], None)
|
||||
|
||||
assert len(result.groups) == 2 # f1+f2 in one group, f3 alone
|
||||
assert len(result.groups[0].findings) == 2
|
||||
assert len(result.groups[1].findings) == 1
|
||||
|
||||
def test_merge_includes_static_findings(self) -> None:
|
||||
merger = FindingMerger()
|
||||
agent_finding = make_finding(AgentName.SECURITY)
|
||||
static_finding = make_finding(
|
||||
AgentName.STYLE,
|
||||
title="[ruff] E501",
|
||||
line_start=100,
|
||||
)
|
||||
|
||||
result = merger.merge([agent_finding], [static_finding])
|
||||
|
||||
assert len(result.unique_findings) == 2
|
||||
assert len(result.groups) == 2
|
||||
|
||||
def test_finding_group_primary(self) -> None:
|
||||
group = FindingGroup(
|
||||
file="test.py",
|
||||
line_start=10,
|
||||
line_end=20,
|
||||
findings=[
|
||||
make_finding(AgentName.STYLE, severity=Severity.LOW),
|
||||
make_finding(AgentName.SECURITY, severity=Severity.HIGH),
|
||||
make_finding(AgentName.COMPLEXITY, severity=Severity.MEDIUM),
|
||||
],
|
||||
)
|
||||
|
||||
primary = group.primary_finding
|
||||
assert primary is not None
|
||||
assert primary.severity == Severity.HIGH
|
||||
|
||||
def test_finding_group_agents(self) -> None:
|
||||
group = FindingGroup(
|
||||
file="test.py",
|
||||
line_start=10,
|
||||
line_end=20,
|
||||
findings=[
|
||||
make_finding(AgentName.SECURITY),
|
||||
make_finding(AgentName.STYLE),
|
||||
],
|
||||
)
|
||||
|
||||
agents = group.agents
|
||||
assert len(agents) == 2
|
||||
assert AgentName.SECURITY in agents
|
||||
assert AgentName.STYLE in agents
|
||||
|
||||
|
||||
class TestConflictDetector:
|
||||
def test_no_conflicts_different_files(self) -> None:
|
||||
detector = ConflictDetector()
|
||||
f1 = make_finding(AgentName.SECURITY, file="a.py")
|
||||
f2 = make_finding(AgentName.STYLE, file="b.py")
|
||||
|
||||
conflicts = detector.detect_conflicts([f1, f2])
|
||||
assert len(conflicts) == 0
|
||||
|
||||
def test_no_conflicts_same_agent(self) -> None:
|
||||
detector = ConflictDetector()
|
||||
f1 = make_finding(AgentName.SECURITY, line_start=10)
|
||||
f2 = make_finding(AgentName.SECURITY, line_start=12)
|
||||
|
||||
conflicts = detector.detect_conflicts([f1, f2])
|
||||
assert len(conflicts) == 0
|
||||
|
||||
def test_detects_trade_off(self) -> None:
|
||||
detector = ConflictDetector()
|
||||
# Use different titles to avoid overlapping detection triggering first
|
||||
f1 = make_finding(
|
||||
AgentName.SECURITY, severity=Severity.HIGH, title="SQL injection vulnerability"
|
||||
)
|
||||
f2 = make_finding(
|
||||
AgentName.COMPLEXITY, severity=Severity.MEDIUM, title="Function too complex"
|
||||
)
|
||||
|
||||
conflicts = detector.detect_conflicts([f1, f2])
|
||||
|
||||
assert len(conflicts) == 1
|
||||
assert conflicts[0].nature == ConflictNature.TRADE_OFF
|
||||
assert "security" in conflicts[0].description.lower()
|
||||
assert "complexity" in conflicts[0].description.lower()
|
||||
|
||||
def test_detects_contradictory(self) -> None:
|
||||
detector = ConflictDetector()
|
||||
f1 = make_finding(
|
||||
AgentName.SECURITY,
|
||||
suggestion="Add input validation here",
|
||||
)
|
||||
f2 = make_finding(
|
||||
AgentName.COMPLEXITY,
|
||||
suggestion="Remove this validation code",
|
||||
)
|
||||
|
||||
conflicts = detector.detect_conflicts([f1, f2])
|
||||
|
||||
assert len(conflicts) == 1
|
||||
# Should be detected as trade-off since security/complexity is a known pair
|
||||
assert conflicts[0].nature in (ConflictNature.CONTRADICTORY, ConflictNature.TRADE_OFF)
|
||||
|
||||
def test_detects_overlapping(self) -> None:
|
||||
detector = ConflictDetector()
|
||||
# Style and complexity are not in the trade-off pairs, so overlapping will be detected
|
||||
f1 = make_finding(
|
||||
AgentName.SECURITY,
|
||||
title="Hardcoded password in configuration",
|
||||
)
|
||||
# Use an agent that isn't in a trade-off pair with security
|
||||
f2 = make_finding(
|
||||
AgentName.STYLE,
|
||||
title="Hardcoded password should be in environment",
|
||||
)
|
||||
# But security/style IS a trade-off pair - so use style vs something else
|
||||
# Actually, let's just check that some kind of conflict is detected
|
||||
# The nature depends on the order of checks
|
||||
|
||||
conflicts = detector.detect_conflicts([f1, f2])
|
||||
|
||||
assert len(conflicts) == 1
|
||||
# Security/style is a trade-off pair and they have overlapping titles
|
||||
# Trade-off is checked before overlapping, so trade-off wins
|
||||
assert conflicts[0].nature in (ConflictNature.TRADE_OFF, ConflictNature.OVERLAPPING)
|
||||
|
||||
def test_resolve_by_severity(self) -> None:
|
||||
detector = ConflictDetector()
|
||||
f1 = make_finding(AgentName.SECURITY, severity=Severity.HIGH)
|
||||
f2 = make_finding(AgentName.COMPLEXITY, severity=Severity.MEDIUM)
|
||||
|
||||
conflicts = detector.detect_conflicts([f1, f2])
|
||||
resolved = detector.resolve_by_severity(conflicts[0], [f1, f2])
|
||||
|
||||
assert resolved.winning_finding_id == f1.id
|
||||
assert "severity" in resolved.resolution.lower()
|
||||
|
||||
|
||||
class TestConflictSynthesizer:
|
||||
@pytest.mark.asyncio
|
||||
async def test_synthesize_returns_resolution(self) -> None:
|
||||
mock_response = """{
|
||||
"decision": "prefer_first",
|
||||
"reasoning": "Security takes priority over complexity",
|
||||
"merged_suggestion": null,
|
||||
"confidence": 0.85
|
||||
}"""
|
||||
mock_llm = MockLLMClient(responses=[mock_response])
|
||||
synthesizer = ConflictSynthesizer(mock_llm)
|
||||
|
||||
f1 = make_finding(AgentName.SECURITY, severity=Severity.HIGH)
|
||||
f2 = make_finding(AgentName.COMPLEXITY, severity=Severity.MEDIUM)
|
||||
conflict = Conflict(
|
||||
id="test-conflict",
|
||||
finding_ids=[f1.id, f2.id],
|
||||
nature=ConflictNature.TRADE_OFF,
|
||||
description="Test conflict",
|
||||
severity_weight=0.8,
|
||||
)
|
||||
|
||||
resolution = await synthesizer.synthesize(conflict, [f1, f2])
|
||||
|
||||
assert resolution.decision == "prefer_first"
|
||||
assert resolution.confidence == 0.85
|
||||
assert "security" in resolution.reasoning.lower()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_synthesize_handles_invalid_json(self) -> None:
|
||||
mock_llm = MockLLMClient(responses=["not valid json"])
|
||||
synthesizer = ConflictSynthesizer(mock_llm)
|
||||
|
||||
f1 = make_finding(AgentName.SECURITY, severity=Severity.HIGH)
|
||||
f2 = make_finding(AgentName.COMPLEXITY, severity=Severity.LOW)
|
||||
conflict = Conflict(
|
||||
id="test-conflict",
|
||||
finding_ids=[f1.id, f2.id],
|
||||
nature=ConflictNature.TRADE_OFF,
|
||||
description="Test conflict",
|
||||
severity_weight=0.8,
|
||||
)
|
||||
|
||||
resolution = await synthesizer.synthesize(conflict, [f1, f2])
|
||||
|
||||
# Should fall back to severity-based resolution
|
||||
assert resolution.decision == "prefer_first"
|
||||
assert "fallback" in resolution.reasoning.lower()
|
||||
|
||||
def test_should_synthesize_contradictory(self) -> None:
|
||||
synthesizer = ConflictSynthesizer(MockLLMClient())
|
||||
conflict = Conflict(
|
||||
id="test",
|
||||
finding_ids=["a", "b"],
|
||||
nature=ConflictNature.CONTRADICTORY,
|
||||
description="Test",
|
||||
severity_weight=0.5,
|
||||
)
|
||||
|
||||
assert synthesizer.should_synthesize(conflict) is True
|
||||
|
||||
def test_should_not_synthesize_overlapping(self) -> None:
|
||||
synthesizer = ConflictSynthesizer(MockLLMClient())
|
||||
conflict = Conflict(
|
||||
id="test",
|
||||
finding_ids=["a", "b"],
|
||||
nature=ConflictNature.OVERLAPPING,
|
||||
description="Test",
|
||||
severity_weight=0.5,
|
||||
)
|
||||
|
||||
assert synthesizer.should_synthesize(conflict) is False
|
||||
|
||||
|
||||
class TestCoordinator:
|
||||
@pytest.mark.asyncio
|
||||
async def test_deliberate_empty_results(self) -> None:
|
||||
coordinator = Coordinator()
|
||||
result = await coordinator.deliberate([], None)
|
||||
|
||||
assert result.verdict == Verdict.APPROVE
|
||||
assert result.total_findings == 0
|
||||
assert len(result.steps) > 0
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_deliberate_merges_findings(self) -> None:
|
||||
coordinator = Coordinator()
|
||||
|
||||
results = [
|
||||
ReviewResult(
|
||||
agent_name=AgentName.SECURITY,
|
||||
findings=[make_finding(AgentName.SECURITY)],
|
||||
duration_ms=100,
|
||||
tokens_used=1000,
|
||||
cost_usd=0.01,
|
||||
),
|
||||
ReviewResult(
|
||||
agent_name=AgentName.STYLE,
|
||||
findings=[make_finding(AgentName.STYLE, line_start=50)],
|
||||
duration_ms=100,
|
||||
tokens_used=1000,
|
||||
cost_usd=0.01,
|
||||
),
|
||||
]
|
||||
|
||||
result = await coordinator.deliberate(results)
|
||||
|
||||
assert result.total_findings == 2
|
||||
assert len(result.merged.groups) == 2
|
||||
assert any(s.step_type == StepType.MERGE for s in result.steps)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_deliberate_detects_conflicts(self) -> None:
|
||||
coordinator = Coordinator()
|
||||
|
||||
# Create findings at same location from different agents with different titles
|
||||
results = [
|
||||
ReviewResult(
|
||||
agent_name=AgentName.SECURITY,
|
||||
findings=[
|
||||
make_finding(
|
||||
AgentName.SECURITY, severity=Severity.HIGH, title="SQL injection risk"
|
||||
)
|
||||
],
|
||||
duration_ms=100,
|
||||
tokens_used=1000,
|
||||
cost_usd=0.01,
|
||||
),
|
||||
ReviewResult(
|
||||
agent_name=AgentName.COMPLEXITY,
|
||||
findings=[
|
||||
make_finding(
|
||||
AgentName.COMPLEXITY,
|
||||
severity=Severity.MEDIUM,
|
||||
title="Overly complex function",
|
||||
)
|
||||
],
|
||||
duration_ms=100,
|
||||
tokens_used=1000,
|
||||
cost_usd=0.01,
|
||||
),
|
||||
]
|
||||
|
||||
result = await coordinator.deliberate(results)
|
||||
|
||||
assert len(result.conflicts) > 0
|
||||
assert any(s.step_type == StepType.CONFLICT_DETECTION for s in result.steps)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_verdict_critical_requests_changes(self) -> None:
|
||||
coordinator = Coordinator()
|
||||
|
||||
results = [
|
||||
ReviewResult(
|
||||
agent_name=AgentName.SECURITY,
|
||||
findings=[make_finding(AgentName.SECURITY, severity=Severity.CRITICAL)],
|
||||
duration_ms=100,
|
||||
tokens_used=1000,
|
||||
cost_usd=0.01,
|
||||
),
|
||||
]
|
||||
|
||||
result = await coordinator.deliberate(results)
|
||||
|
||||
assert result.verdict == Verdict.REQUEST_CHANGES
|
||||
assert result.critical_count == 1
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_verdict_multiple_high_requests_changes(self) -> None:
|
||||
coordinator = Coordinator()
|
||||
|
||||
results = [
|
||||
ReviewResult(
|
||||
agent_name=AgentName.SECURITY,
|
||||
findings=[
|
||||
make_finding(AgentName.SECURITY, severity=Severity.HIGH, line_start=10),
|
||||
make_finding(AgentName.SECURITY, severity=Severity.HIGH, line_start=20),
|
||||
make_finding(AgentName.SECURITY, severity=Severity.HIGH, line_start=30),
|
||||
],
|
||||
duration_ms=100,
|
||||
tokens_used=1000,
|
||||
cost_usd=0.01,
|
||||
),
|
||||
]
|
||||
|
||||
result = await coordinator.deliberate(results)
|
||||
|
||||
assert result.verdict == Verdict.REQUEST_CHANGES
|
||||
assert result.high_count == 3
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_verdict_low_severity_approves(self) -> None:
|
||||
coordinator = Coordinator()
|
||||
|
||||
results = [
|
||||
ReviewResult(
|
||||
agent_name=AgentName.STYLE,
|
||||
findings=[
|
||||
make_finding(AgentName.STYLE, severity=Severity.LOW, line_start=10),
|
||||
make_finding(AgentName.STYLE, severity=Severity.INFO, line_start=20),
|
||||
],
|
||||
duration_ms=100,
|
||||
tokens_used=1000,
|
||||
cost_usd=0.01,
|
||||
),
|
||||
]
|
||||
|
||||
result = await coordinator.deliberate(results)
|
||||
|
||||
assert result.verdict == Verdict.APPROVE
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_deliberation_steps_logged(self) -> None:
|
||||
coordinator = Coordinator()
|
||||
|
||||
results = [
|
||||
ReviewResult(
|
||||
agent_name=AgentName.SECURITY,
|
||||
findings=[make_finding(AgentName.SECURITY)],
|
||||
duration_ms=100,
|
||||
tokens_used=1000,
|
||||
cost_usd=0.01,
|
||||
),
|
||||
]
|
||||
|
||||
result = await coordinator.deliberate(results)
|
||||
|
||||
step_types = [s.step_type for s in result.steps]
|
||||
assert StepType.MERGE in step_types
|
||||
assert StepType.CONFLICT_DETECTION in step_types
|
||||
assert StepType.VERDICT in step_types
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_verdict_medium_count_comments(self) -> None:
|
||||
coordinator = Coordinator()
|
||||
|
||||
results = [
|
||||
ReviewResult(
|
||||
agent_name=AgentName.STYLE,
|
||||
findings=[
|
||||
make_finding(
|
||||
AgentName.STYLE,
|
||||
severity=Severity.MEDIUM,
|
||||
line_start=(i + 1) * 10,
|
||||
title=f"Issue {i}",
|
||||
)
|
||||
for i in range(5)
|
||||
],
|
||||
duration_ms=100,
|
||||
tokens_used=1000,
|
||||
cost_usd=0.01,
|
||||
),
|
||||
]
|
||||
|
||||
result = await coordinator.deliberate(results)
|
||||
|
||||
assert result.verdict == Verdict.COMMENT
|
||||
assert "medium" in result.verdict_reasoning.lower()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_verdict_single_high_comments(self) -> None:
|
||||
coordinator = Coordinator()
|
||||
|
||||
results = [
|
||||
ReviewResult(
|
||||
agent_name=AgentName.SECURITY,
|
||||
findings=[
|
||||
make_finding(AgentName.SECURITY, severity=Severity.HIGH),
|
||||
],
|
||||
duration_ms=100,
|
||||
tokens_used=1000,
|
||||
cost_usd=0.01,
|
||||
),
|
||||
]
|
||||
|
||||
result = await coordinator.deliberate(results)
|
||||
|
||||
assert result.verdict == Verdict.COMMENT
|
||||
assert result.high_count == 1
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_deliberate_with_synthesis(self) -> None:
|
||||
mock_response = """{
|
||||
"decision": "prefer_first",
|
||||
"reasoning": "Security takes priority",
|
||||
"merged_suggestion": null,
|
||||
"confidence": 0.85
|
||||
}"""
|
||||
mock_llm = MockLLMClient(responses=[mock_response])
|
||||
coordinator = Coordinator(llm_client=mock_llm)
|
||||
|
||||
# Create findings at same location from different agents
|
||||
results = [
|
||||
ReviewResult(
|
||||
agent_name=AgentName.SECURITY,
|
||||
findings=[
|
||||
make_finding(
|
||||
AgentName.SECURITY,
|
||||
severity=Severity.HIGH,
|
||||
title="Security vulnerability",
|
||||
suggestion="Add validation",
|
||||
)
|
||||
],
|
||||
duration_ms=100,
|
||||
tokens_used=1000,
|
||||
cost_usd=0.01,
|
||||
),
|
||||
ReviewResult(
|
||||
agent_name=AgentName.COMPLEXITY,
|
||||
findings=[
|
||||
make_finding(
|
||||
AgentName.COMPLEXITY,
|
||||
severity=Severity.MEDIUM,
|
||||
title="Complex function",
|
||||
suggestion="Remove validation",
|
||||
)
|
||||
],
|
||||
duration_ms=100,
|
||||
tokens_used=1000,
|
||||
cost_usd=0.01,
|
||||
),
|
||||
]
|
||||
|
||||
result = await coordinator.deliberate(results)
|
||||
|
||||
assert len(result.conflicts) > 0
|
||||
# Synthesis step should be logged
|
||||
assert any(s.step_type == StepType.SYNTHESIS for s in result.steps)
|
||||
|
||||
|
||||
class TestConflictDetectorEdgeCases:
|
||||
def test_no_conflicts_with_no_overlap(self) -> None:
|
||||
detector = ConflictDetector()
|
||||
f1 = make_finding(AgentName.SECURITY, line_start=10, line_end=15)
|
||||
f2 = make_finding(AgentName.STYLE, line_start=100, line_end=105)
|
||||
|
||||
conflicts = detector.detect_conflicts([f1, f2])
|
||||
assert len(conflicts) == 0
|
||||
|
||||
def test_overlap_no_title_match(self) -> None:
|
||||
detector = ConflictDetector()
|
||||
# These agents are in TRADE_OFF_PAIRS, so will be detected as trade-off
|
||||
f1 = make_finding(
|
||||
AgentName.SECURITY,
|
||||
title="Unique security title",
|
||||
)
|
||||
f2 = make_finding(
|
||||
AgentName.STYLE,
|
||||
title="Completely different style concern",
|
||||
)
|
||||
|
||||
conflicts = detector.detect_conflicts([f1, f2])
|
||||
assert len(conflicts) == 1
|
||||
# Security/Style is a trade-off pair
|
||||
assert conflicts[0].nature == ConflictNature.TRADE_OFF
|
||||
|
||||
def test_resolve_empty_findings(self) -> None:
|
||||
detector = ConflictDetector()
|
||||
conflict = Conflict(
|
||||
id="test",
|
||||
finding_ids=["nonexistent1", "nonexistent2"],
|
||||
nature=ConflictNature.TRADE_OFF,
|
||||
description="Test",
|
||||
severity_weight=0.5,
|
||||
)
|
||||
|
||||
resolved = detector.resolve_by_severity(conflict, [])
|
||||
assert resolved.winning_finding_id is None
|
||||
|
||||
|
||||
class TestConflictSynthesizerEdgeCases:
|
||||
@pytest.mark.asyncio
|
||||
async def test_synthesize_missing_findings(self) -> None:
|
||||
mock_llm = MockLLMClient()
|
||||
synthesizer = ConflictSynthesizer(mock_llm)
|
||||
|
||||
conflict = Conflict(
|
||||
id="test",
|
||||
finding_ids=["nonexistent1", "nonexistent2"],
|
||||
nature=ConflictNature.CONTRADICTORY,
|
||||
description="Test",
|
||||
severity_weight=0.8,
|
||||
)
|
||||
|
||||
resolution = await synthesizer.synthesize(conflict, [])
|
||||
|
||||
assert resolution.decision == "keep_both"
|
||||
assert "Could not find" in resolution.reasoning
|
||||
|
||||
def test_synthesize_low_severity(self) -> None:
|
||||
synthesizer = ConflictSynthesizer(MockLLMClient())
|
||||
conflict = Conflict(
|
||||
id="test",
|
||||
finding_ids=["a", "b"],
|
||||
nature=ConflictNature.TRADE_OFF,
|
||||
description="Test",
|
||||
severity_weight=0.5, # Below 0.7 threshold
|
||||
)
|
||||
|
||||
assert synthesizer.should_synthesize(conflict) is False
|
||||
|
||||
def test_synthesize_high_severity(self) -> None:
|
||||
synthesizer = ConflictSynthesizer(MockLLMClient())
|
||||
conflict = Conflict(
|
||||
id="test",
|
||||
finding_ids=["a", "b"],
|
||||
nature=ConflictNature.TRADE_OFF,
|
||||
description="Test",
|
||||
severity_weight=0.8, # Above 0.7 threshold
|
||||
)
|
||||
|
||||
assert synthesizer.should_synthesize(conflict) is True
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_synthesize_fallback_prefer_second(self) -> None:
|
||||
mock_llm = MockLLMClient(responses=["not valid json"])
|
||||
synthesizer = ConflictSynthesizer(mock_llm)
|
||||
|
||||
f1 = make_finding(AgentName.STYLE, severity=Severity.LOW)
|
||||
f2 = make_finding(AgentName.SECURITY, severity=Severity.HIGH)
|
||||
conflict = Conflict(
|
||||
id="test-conflict",
|
||||
finding_ids=[f1.id, f2.id],
|
||||
nature=ConflictNature.CONTRADICTORY,
|
||||
description="Test conflict",
|
||||
severity_weight=0.8,
|
||||
)
|
||||
|
||||
resolution = await synthesizer.synthesize(conflict, [f1, f2])
|
||||
|
||||
assert resolution.decision == "prefer_second"
|
||||
assert "fallback" in resolution.reasoning.lower()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_synthesize_fallback_equal_severity(self) -> None:
|
||||
mock_llm = MockLLMClient(responses=["not valid json"])
|
||||
synthesizer = ConflictSynthesizer(mock_llm)
|
||||
|
||||
f1 = make_finding(AgentName.STYLE, severity=Severity.MEDIUM)
|
||||
f2 = make_finding(AgentName.SECURITY, severity=Severity.MEDIUM)
|
||||
conflict = Conflict(
|
||||
id="test-conflict",
|
||||
finding_ids=[f1.id, f2.id],
|
||||
nature=ConflictNature.CONTRADICTORY,
|
||||
description="Test conflict",
|
||||
severity_weight=0.8,
|
||||
)
|
||||
|
||||
resolution = await synthesizer.synthesize(conflict, [f1, f2])
|
||||
|
||||
assert resolution.decision == "keep_both"
|
||||
assert "equal severity" in resolution.reasoning.lower()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_synthesize_parse_json_in_code_block(self) -> None:
|
||||
mock_response = """Here is my analysis:
|
||||
```json
|
||||
{
|
||||
"decision": "merge",
|
||||
"reasoning": "Both concerns valid",
|
||||
"merged_suggestion": "Do both things",
|
||||
"confidence": 0.9
|
||||
}
|
||||
```
|
||||
"""
|
||||
mock_llm = MockLLMClient(responses=[mock_response])
|
||||
synthesizer = ConflictSynthesizer(mock_llm)
|
||||
|
||||
f1 = make_finding(AgentName.SECURITY)
|
||||
f2 = make_finding(AgentName.COMPLEXITY)
|
||||
conflict = Conflict(
|
||||
id="test-conflict",
|
||||
finding_ids=[f1.id, f2.id],
|
||||
nature=ConflictNature.CONTRADICTORY,
|
||||
description="Test",
|
||||
severity_weight=0.8,
|
||||
)
|
||||
|
||||
resolution = await synthesizer.synthesize(conflict, [f1, f2])
|
||||
|
||||
assert resolution.decision == "merge"
|
||||
assert resolution.merged_suggestion == "Do both things"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_synthesize_parse_plain_json(self) -> None:
|
||||
mock_response = """{
|
||||
"decision": "prefer_second",
|
||||
"reasoning": "Second is better",
|
||||
"confidence": 0.75
|
||||
}"""
|
||||
mock_llm = MockLLMClient(responses=[mock_response])
|
||||
synthesizer = ConflictSynthesizer(mock_llm)
|
||||
|
||||
f1 = make_finding(AgentName.SECURITY)
|
||||
f2 = make_finding(AgentName.COMPLEXITY)
|
||||
conflict = Conflict(
|
||||
id="test-conflict",
|
||||
finding_ids=[f1.id, f2.id],
|
||||
nature=ConflictNature.CONTRADICTORY,
|
||||
description="Test",
|
||||
severity_weight=0.8,
|
||||
)
|
||||
|
||||
resolution = await synthesizer.synthesize(conflict, [f1, f2])
|
||||
|
||||
assert resolution.decision == "prefer_second"
|
||||
assert resolution.confidence == 0.75
|
||||
|
||||
|
||||
class TestFindingMergerEdgeCases:
|
||||
def test_merge_different_files(self) -> None:
|
||||
merger = FindingMerger()
|
||||
f1 = make_finding(AgentName.SECURITY, file="a.py", line_start=10)
|
||||
f2 = make_finding(AgentName.SECURITY, file="b.py", line_start=10)
|
||||
|
||||
result = merger.merge([f1, f2], None)
|
||||
|
||||
assert len(result.groups) == 2
|
||||
assert len(result.unique_findings) == 2
|
||||
|
||||
def test_finding_group_empty(self) -> None:
|
||||
group = FindingGroup(
|
||||
file="test.py",
|
||||
line_start=10,
|
||||
line_end=20,
|
||||
findings=[],
|
||||
)
|
||||
|
||||
assert group.primary_finding is None
|
||||
assert group.agents == []
|
||||
1344
tests/test_static_analysis.py
Normal file
1344
tests/test_static_analysis.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user