regression detection logic

Rolling window baseline computation and statistical regression detection.
2025-04-19 11:13:01 +00:00
parent e6f55c2781
commit 32fec2b6d5
1 changed files with 82 additions and 0 deletions
@@ -0,0 +1,82 @@
 """Regression detection using rolling window comparison."""
 from veritext.benchmark.models import BenchmarkRun, RegressionReport
 def compute_baseline(
    runs: list[BenchmarkRun],
    window: int = 10,
 ) -> dict[str, float]:
    """
    Compute a rolling-average baseline from the most recent runs.

    Only the first ``window`` entries of ``runs`` are considered. Each metric
    is averaged over the runs that actually report it, so a metric that is
    absent from some runs is not diluted by them.

    Args:
        runs: List of benchmark runs (most recent first).
        window: Number of runs to include in the baseline. Non-positive
            values yield an empty baseline.
    Returns:
        Dictionary of metric names to their average values; empty when there
        is nothing to average.
    """
    # A negative window would otherwise slice from the end (runs[:-k]) and
    # silently average nearly the whole history instead of a recent window.
    if not runs or window <= 0:
        return {}
    samples: dict[str, list[float]] = {}
    for run in runs[:window]:
        for metric_name, value in run.metrics.items():
            samples.setdefault(metric_name, []).append(value)
    return {
        metric: sum(values) / len(values) for metric, values in samples.items()
    }
 def detect_regression(
    current: dict[str, float],
    baseline: dict[str, float],
    tolerance: float = 0.05,
 ) -> RegressionReport:
    """
    Compare current metrics against a baseline and flag regressions.

    A regression is detected if any baseline metric drops by more than
    ``tolerance`` expressed as a fraction of that metric's baseline
    magnitude. When a baseline value is exactly zero there is no scale to be
    relative to, so ``tolerance`` is applied as an absolute drop instead.

    Only metrics present in ``baseline`` are examined. A metric missing from
    ``current`` is scored as 0.0. An empty baseline never reports a
    regression.

    Args:
        current: Current metric values.
        baseline: Baseline metric values.
        tolerance: Maximum allowed relative drop before a regression is
            flagged (e.g., 0.05 = 5% of the baseline value).
    Returns:
        RegressionReport with the detection flag, both metric dictionaries,
        the absolute per-metric deltas (current - baseline), and the
        tolerance that was applied.
    """
    deltas: dict[str, float] = {}
    detected = False
    # With an empty baseline the loop body never runs, leaving
    # detected=False and deltas={}.
    for metric, baseline_value in baseline.items():
        delta = current.get(metric, 0.0) - baseline_value
        deltas[metric] = delta
        # Scale the allowed drop by the baseline magnitude so the tolerance
        # is a percentage rather than an absolute amount; multiplying
        # (instead of dividing the delta) avoids a zero-division on a zero
        # baseline.
        scale = abs(baseline_value)
        allowed_drop = tolerance * scale if scale else tolerance
        if delta < -allowed_drop:
            detected = True
    return RegressionReport(
        detected=detected,
        baseline=baseline,
        current=current,
        deltas=deltas,
        tolerance=tolerance,
    )