"""Regression detection using rolling window comparison."""

from veritext.benchmark.models import BenchmarkRun, RegressionReport


def compute_baseline(
    runs: list[BenchmarkRun],
    window: int = 10,
) -> dict[str, float]:
    """
    Compute rolling average baseline from recent runs.

    Each metric is averaged over the runs in the window that actually
    report it, so a metric missing from some runs is not diluted by them.

    Args:
        runs: List of benchmark runs (most recent first).
        window: Number of runs to include in the baseline. A non-positive
            window selects no runs.

    Returns:
        Dictionary of metric names to their average values. Empty when
        there are no runs or the window is non-positive.
    """
    # Guard against negative windows: ``runs[:-3]`` would silently mean
    # "all but the last three" rather than "no runs".
    if not runs or window <= 0:
        return {}

    metric_values: dict[str, list[float]] = {}
    for run in runs[:window]:
        for metric_name, value in run.metrics.items():
            metric_values.setdefault(metric_name, []).append(value)

    return {
        metric: sum(values) / len(values) for metric, values in metric_values.items()
    }


def detect_regression(
    current: dict[str, float],
    baseline: dict[str, float],
    tolerance: float = 0.05,
) -> RegressionReport:
    """
    Compare current metrics against baseline.

    A regression is detected if any metric drops by more than the tolerance
    threshold (relative to its baseline value). When a baseline value is
    zero a relative drop is undefined, so the absolute drop is compared
    against the tolerance instead.

    Only metrics present in the baseline are compared; a baseline metric
    that is missing from ``current`` is treated as having the value 0.0.

    Args:
        current: Current metric values.
        baseline: Baseline metric values.
        tolerance: Maximum allowed drop before regression is flagged (e.g., 0.05 = 5%).

    Returns:
        RegressionReport with comparison results. ``deltas`` holds the
        absolute difference (current - baseline) for each baseline metric.
        An empty baseline yields a report with no deltas and no regression.
    """
    deltas: dict[str, float] = {}
    detected = False

    for metric, baseline_value in baseline.items():
        current_value = current.get(metric, 0.0)
        delta = current_value - baseline_value
        deltas[metric] = delta

        # Scale the change by the baseline magnitude so the tolerance is a
        # fraction of the baseline, as documented; abs() keeps the sign of
        # the change meaningful for negative baselines.
        if baseline_value:
            drop = delta / abs(baseline_value)
        else:
            drop = delta

        if drop < -tolerance:
            detected = True

    return RegressionReport(
        detected=detected,
        baseline=baseline,
        current=current,
        deltas=deltas,
        tolerance=tolerance,
    )