regression detection logic
Rolling window baseline computation and statistical regression detection.
This commit is contained in:
82
src/veritext/benchmark/regression.py
Normal file
82
src/veritext/benchmark/regression.py
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
"""Regression detection using rolling window comparison."""
|
||||||
|
|
||||||
|
from veritext.benchmark.models import BenchmarkRun, RegressionReport
|
||||||
|
|
||||||
|
|
||||||
|
def compute_baseline(
    runs: list[BenchmarkRun],
    window: int = 10,
) -> dict[str, float]:
    """
    Compute rolling average baseline from recent runs.

    Only the first ``window`` entries of ``runs`` contribute. Each metric is
    averaged over the runs in the window that actually report it, so a metric
    that is missing from some runs is not diluted by implicit zeros.

    Args:
        runs: List of benchmark runs (most recent first). Each run must expose
            a ``metrics`` mapping of metric name to numeric value.
        window: Number of runs to include in the baseline. A value of zero or
            less selects no runs.

    Returns:
        Dictionary of metric names to their average values; empty when there
        are no runs or the window is not positive.
    """
    # Guard the window explicitly: a negative value would otherwise turn the
    # slice below into "all but the last N runs" rather than "the first N".
    if not runs or window <= 0:
        return {}

    # Collect every observed value per metric across the window.
    metric_values: dict[str, list[float]] = {}
    for run in runs[:window]:
        for metric_name, value in run.metrics.items():
            metric_values.setdefault(metric_name, []).append(value)

    # Every list holds at least one value, so the division is always defined.
    return {
        metric: sum(values) / len(values) for metric, values in metric_values.items()
    }
|
||||||
|
|
||||||
|
|
||||||
|
def detect_regression(
    current: dict[str, float],
    baseline: dict[str, float],
    tolerance: float = 0.05,
) -> RegressionReport:
    """
    Compare current metrics against baseline.

    For every metric in ``baseline`` the delta ``current - baseline`` is
    recorded. A regression is detected if any delta is strictly below
    ``-tolerance``, i.e. the metric dropped by more than ``tolerance`` in the
    metric's own units. Improvements are never flagged, and metrics that
    appear only in ``current`` are ignored.

    NOTE(review): the comparison is absolute, not proportional to the
    baseline value; confirm that this is the intended meaning of
    ``tolerance``.

    Args:
        current: Current metric values. A metric missing here is treated as
            0.0, so it counts as a drop of its full baseline value.
        baseline: Baseline metric values.
        tolerance: Maximum allowed drop before regression is flagged
            (e.g., with 0.05 a fall from 0.90 to below 0.85 is flagged).

    Returns:
        RegressionReport with comparison results. With an empty baseline the
        report has ``detected=False`` and no deltas.
    """
    deltas: dict[str, float] = {}
    # An empty baseline leaves nothing to compare; `current` is not consulted.
    if baseline:
        deltas = {
            metric: current.get(metric, 0.0) - baseline_value
            for metric, baseline_value in baseline.items()
        }

    # Strict comparison: a drop of exactly `tolerance` is still acceptable.
    detected = any(delta < -tolerance for delta in deltas.values())

    return RegressionReport(
        detected=detected,
        baseline=baseline,
        current=current,
        deltas=deltas,
        tolerance=tolerance,
    )
|
||||||
Reference in New Issue
Block a user