"""Tests for regression detection."""

from datetime import UTC, datetime

import pytest

from veritext.benchmark.models import BenchmarkRun
from veritext.benchmark.regression import compute_baseline, detect_regression


def make_run(
    run_id: str,
    metrics: dict[str, float],
    day: int = 1,
) -> BenchmarkRun:
    """Build a ``BenchmarkRun`` fixture for the tests in this module.

    Args:
        run_id: Identifier stored on the run.
        metrics: Metric name to score mapping stored on the run.
        day: Day of January 2025 used for the run's timestamp (noon, UTC).

    Returns:
        A run with fixed benchmark name, version and sample count, so only
        the id, metrics and timestamp vary between fixtures.
    """
    return BenchmarkRun(
        id=run_id,
        benchmark_name="test",
        timestamp=datetime(2025, 1, day, 12, 0, 0, tzinfo=UTC),
        veritext_version="0.1.0",
        metrics=metrics,
        sample_count=10,
    )


class TestComputeBaseline:
    """Expectations for ``compute_baseline`` over lists of historical runs."""

    def test_empty_runs(self) -> None:
        """No runs yields an empty baseline."""
        baseline = compute_baseline([])
        assert baseline == {}

    def test_single_run(self) -> None:
        """With one run, the baseline equals that run's metrics."""
        runs = [make_run("r1", {"bleu4": 0.75, "rouge_l": 0.80})]
        baseline = compute_baseline(runs)
        assert baseline["bleu4"] == 0.75
        assert baseline["rouge_l"] == 0.80

    def test_multiple_runs_average(self) -> None:
        """The baseline is the mean of the metric across the runs."""
        runs = [
            make_run("r1", {"bleu4": 0.70}, day=3),
            make_run("r2", {"bleu4": 0.80}, day=2),
            make_run("r3", {"bleu4": 0.90}, day=1),
        ]
        baseline = compute_baseline(runs, window=3)
        assert baseline["bleu4"] == pytest.approx(0.80)  # (0.70+0.80+0.90)/3

    def test_window_limits_runs(self) -> None:
        """Only ``window`` runs contribute; the remaining ones are ignored."""
        runs = [
            make_run("r1", {"bleu4": 0.70}, day=5),  # most recent
            make_run("r2", {"bleu4": 0.80}, day=4),
            make_run("r3", {"bleu4": 0.90}, day=3),
            make_run("r4", {"bleu4": 0.60}, day=2),  # excluded
            make_run("r5", {"bleu4": 0.50}, day=1),  # excluded
        ]
        baseline = compute_baseline(runs, window=3)
        # Only first 3 runs: (0.70 + 0.80 + 0.90) / 3 = 0.80
        assert baseline["bleu4"] == pytest.approx(0.80)

    def test_partial_history(self) -> None:
        """A window larger than the history averages what is available."""
        runs = [
            make_run("r1", {"bleu4": 0.70}),
            make_run("r2", {"bleu4": 0.80}),
        ]
        baseline = compute_baseline(runs, window=10)
        # Only 2 runs available: (0.70 + 0.80) / 2 = 0.75
        assert baseline["bleu4"] == pytest.approx(0.75)

    def test_multiple_metrics(self) -> None:
        """Each metric is averaged independently."""
        runs = [
            make_run("r1", {"bleu4": 0.70, "rouge_l": 0.75}),
            make_run("r2", {"bleu4": 0.80, "rouge_l": 0.85}),
        ]
        baseline = compute_baseline(runs)
        assert baseline["bleu4"] == pytest.approx(0.75)
        assert baseline["rouge_l"] == pytest.approx(0.80)

    def test_varying_metrics(self) -> None:
        """A metric absent from some runs is averaged over the runs that have it."""
        runs = [
            make_run("r1", {"bleu4": 0.70, "rouge_l": 0.75}),
            make_run("r2", {"bleu4": 0.80}),  # No rouge_l
        ]
        baseline = compute_baseline(runs)
        # bleu4 appears in both runs
        assert baseline["bleu4"] == pytest.approx(0.75)
        # rouge_l only appears in one run
        assert baseline["rouge_l"] == pytest.approx(0.75)


class TestDetectRegression:
    """Expectations for ``detect_regression`` comparing current to baseline."""

    def test_no_baseline(self) -> None:
        """An empty baseline produces no deltas and no regression."""
        report = detect_regression(
            current={"bleu4": 0.70},
            baseline={},
            tolerance=0.05,
        )
        assert not report.detected
        assert report.deltas == {}

    def test_no_regression_stable(self) -> None:
        """An unchanged metric has a zero delta and is not a regression."""
        report = detect_regression(
            current={"bleu4": 0.75},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )
        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(0.0)

    def test_no_regression_improved(self) -> None:
        """An improved metric has a positive delta and is not a regression."""
        report = detect_regression(
            current={"bleu4": 0.85},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )
        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(0.10)

    def test_no_regression_within_tolerance(self) -> None:
        """A drop smaller than the tolerance is not a regression."""
        report = detect_regression(
            current={"bleu4": 0.73},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )
        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.02)

    def test_regression_detected(self) -> None:
        """A drop larger than the tolerance is reported as a regression."""
        report = detect_regression(
            current={"bleu4": 0.65},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )
        assert report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.10)

    def test_regression_at_tolerance_boundary(self) -> None:
        """A drop exactly equal to the tolerance is not a regression."""
        # 0.75, 0.50 and 0.25 are exactly representable in binary floating
        # point, so the delta is exactly -tolerance with no rounding error
        # (0.70 - 0.75, by contrast, lands slightly below -0.05).
        # The implementation checks delta < -tolerance (strictly less than),
        # so a drop that only reaches the tolerance must not be reported.
        report = detect_regression(
            current={"bleu4": 0.50},
            baseline={"bleu4": 0.75},
            tolerance=0.25,
        )
        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.25)

    def test_regression_just_beyond_tolerance(self) -> None:
        """A drop marginally past the tolerance is reported."""
        report = detect_regression(
            current={"bleu4": 0.6999},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )
        # Delta is -0.0501, which is < -tolerance
        assert report.detected

    def test_multiple_metrics_any_regresses(self) -> None:
        """One regressing metric is enough to flag the whole report."""
        report = detect_regression(
            current={"bleu4": 0.65, "rouge_l": 0.80},
            baseline={"bleu4": 0.75, "rouge_l": 0.80},
            tolerance=0.05,
        )
        assert report.detected
        # Only bleu4 regressed
        assert report.deltas["bleu4"] == pytest.approx(-0.10)
        assert report.deltas["rouge_l"] == pytest.approx(0.0)

    def test_report_contains_all_values(self) -> None:
        """The report echoes its inputs and has a delta per metric."""
        baseline = {"bleu4": 0.75, "rouge_l": 0.80}
        current = {"bleu4": 0.65, "rouge_l": 0.82}
        report = detect_regression(current, baseline, tolerance=0.05)
        assert report.baseline == baseline
        assert report.current == current
        assert report.tolerance == 0.05
        assert "bleu4" in report.deltas
        assert "rouge_l" in report.deltas

    def test_missing_metric_in_current(self) -> None:
        """A baseline metric missing from the current run counts as 0.0."""
        report = detect_regression(
            current={},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )
        # 0.0 - 0.75 = -0.75, which is a regression
        assert report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.75)