Files
veritext/src/veritext/benchmark/regression.py
Kai Chappell 83c4b4bee5 feat(benchmark): add regression detection
Rolling window baseline computation and statistical regression detection.
2026-02-03 18:09:55 +00:00

88 lines
2.3 KiB
Python

"""Regression detection using rolling window comparison."""
from veritext.benchmark.models import BenchmarkRun, RegressionReport
def compute_baseline(
    runs: list[BenchmarkRun],
    window: int = 10,
) -> dict[str, float]:
    """
    Compute a rolling-average baseline from the most recent runs.

    Only the first ``window`` entries of ``runs`` are considered. Each metric
    is averaged over the runs in which it appears, so a metric that is
    missing from some runs is averaged over fewer samples rather than being
    treated as zero.

    Args:
        runs: List of benchmark runs (most recent first). Each run must
            expose a ``metrics`` mapping of metric name to numeric value.
        window: Maximum number of runs to include in the baseline. Must be
            non-negative; ``0`` yields an empty baseline.

    Returns:
        Dictionary of metric names to their average values. Empty when
        ``runs`` is empty or ``window`` is ``0``.

    Raises:
        ValueError: If ``window`` is negative.
    """
    # A negative window would turn the slice below into "all but the last
    # |window| runs", silently averaging the wrong set of runs.
    if window < 0:
        raise ValueError(f"window must be non-negative, got {window}")

    # Group every observed value by metric name across the selected runs.
    values_by_metric: dict[str, list[float]] = {}
    for run in runs[:window]:
        for metric_name, value in run.metrics.items():
            values_by_metric.setdefault(metric_name, []).append(value)

    # Each list holds at least one value, so the division is always safe.
    return {
        metric: sum(values) / len(values)
        for metric, values in values_by_metric.items()
    }
def detect_regression(
    current: dict[str, float],
    baseline: dict[str, float],
    tolerance: float = 0.05,
) -> RegressionReport:
    """
    Compare current metrics against baseline.

    A regression is detected if any baseline metric drops by more than
    ``tolerance`` expressed as a fraction of its baseline value, i.e. when
    ``current - baseline < -tolerance * abs(baseline)``. Improvements and
    metrics that appear only in ``current`` never trigger a regression.

    Args:
        current: Current metric values. A metric present in ``baseline`` but
            absent here is treated as ``0.0``.
        baseline: Baseline metric values. An empty baseline never yields a
            regression.
        tolerance: Maximum allowed relative drop before regression is
            flagged (e.g., 0.05 = 5% of the baseline value).

    Returns:
        RegressionReport with comparison results. ``deltas`` maps each
        baseline metric to the absolute difference ``current - baseline``.
    """
    deltas: dict[str, float] = {}
    detected = False

    # With an empty baseline the loop body never runs, which produces the
    # "no regression, no deltas" report without a separate special case.
    for metric, baseline_value in baseline.items():
        # A metric missing from the current run is compared as 0.0.
        delta = current.get(metric, 0.0) - baseline_value
        deltas[metric] = delta

        # Scale the tolerance by the baseline magnitude so the threshold is
        # relative, as documented. A zero baseline allows no drop at all.
        allowed_drop = tolerance * abs(baseline_value)
        if delta < -allowed_drop:
            detected = True

    return RegressionReport(
        detected=detected,
        baseline=baseline,
        current=current,
        deltas=deltas,
        tolerance=tolerance,
    )