regression detection logic

Rolling window baseline computation and statistical regression detection.
This commit is contained in:
2025-04-19 11:13:01 +00:00
parent e6f55c2781
commit 32fec2b6d5

View File

@@ -0,0 +1,82 @@
"""Regression detection using rolling window comparison."""
from veritext.benchmark.models import BenchmarkRun, RegressionReport
def compute_baseline(
    runs: list[BenchmarkRun],
    window: int = 10,
) -> dict[str, float]:
    """
    Compute a per-metric average baseline from the most recent runs.

    Only the first ``window`` entries of ``runs`` are considered. A metric
    that is absent from some of those runs is averaged over the runs that
    actually report it, not over the whole window.

    Args:
        runs: List of benchmark runs (most recent first).
        window: Number of runs to include in the baseline. ``0`` selects no
            runs and therefore yields an empty baseline.

    Returns:
        Dictionary of metric names to their average values. Empty when there
        are no runs to average.

    Raises:
        ValueError: If ``window`` is negative.
    """
    if window < 0:
        # A negative slice bound would not limit the window at all: it would
        # silently drop the last ``abs(window)`` runs and average the rest.
        raise ValueError(f"window must be non-negative, got {window}")
    if not runs:
        return {}

    # Group every observed value by metric name across the selected runs.
    metric_values: dict[str, list[float]] = {}
    for run in runs[:window]:
        for metric_name, value in run.metrics.items():
            metric_values.setdefault(metric_name, []).append(value)

    # Each list holds at least one value, so the division is always safe.
    return {
        metric: sum(values) / len(values)
        for metric, values in metric_values.items()
    }
def detect_regression(
    current: dict[str, float],
    baseline: dict[str, float],
    tolerance: float = 0.05,
) -> RegressionReport:
    """
    Compare current metrics against baseline.

    A regression is detected if any baseline metric drops by more than the
    tolerance threshold, measured relative to the magnitude of its baseline
    value (a drop larger than ``tolerance * abs(baseline_value)``). When the
    baseline value is 0 a relative drop is undefined, so the drop is compared
    against ``tolerance`` as an absolute amount instead.

    Only metrics present in ``baseline`` are compared. A metric missing from
    ``current`` is treated as 0.0. Increases never count as regressions.

    Args:
        current: Current metric values.
        baseline: Baseline metric values.
        tolerance: Maximum allowed relative drop before regression is flagged
            (e.g., 0.05 = 5%).

    Returns:
        RegressionReport with comparison results. ``deltas`` holds the
        absolute difference ``current - baseline`` for each baseline metric.
    """
    deltas: dict[str, float] = {}
    detected = False

    # An empty baseline means there is nothing to compare against, so the
    # report carries no deltas and no regression.
    if baseline:
        for metric, baseline_value in baseline.items():
            current_value = current.get(metric, 0.0)
            delta = current_value - baseline_value
            deltas[metric] = delta

            # Scale the allowed drop by the baseline magnitude so tolerance
            # acts as a fraction; multiplying avoids dividing by a zero
            # baseline, which falls back to an absolute threshold.
            if baseline_value:
                allowed_drop = tolerance * abs(baseline_value)
            else:
                allowed_drop = tolerance
            if delta < -allowed_drop:
                detected = True

    return RegressionReport(
        detected=detected,
        baseline=baseline,
        current=current,
        deltas=deltas,
        tolerance=tolerance,
    )