"""Tests for regression detection."""
|
|
|
|
from datetime import UTC, datetime
|
|
|
|
import pytest
|
|
|
|
from veritext.benchmark.models import BenchmarkRun
|
|
from veritext.benchmark.regression import compute_baseline, detect_regression
|
|
|
|
|
|
def make_run(
    run_id: str,
    metrics: dict[str, float],
    day: int = 1,
) -> BenchmarkRun:
    """Build a ``BenchmarkRun`` fixture for the tests in this module.

    Args:
        run_id: Identifier stored on the run.
        metrics: Metric name to score mapping stored on the run.
        day: Day of January 2025 used for the run's timestamp.

    Returns:
        A run for the "test" benchmark, stamped at 12:00 UTC on the given
        day, with version "0.1.0" and a sample count of 10.
    """
    # Only the day varies between fixtures; everything else is pinned so the
    # tests differ solely in id, metrics and relative age.
    stamped_at = datetime(year=2025, month=1, day=day, hour=12, tzinfo=UTC)
    fixture = BenchmarkRun(
        id=run_id,
        benchmark_name="test",
        timestamp=stamped_at,
        veritext_version="0.1.0",
        metrics=metrics,
        sample_count=10,
    )
    return fixture
|
|
|
|
|
|
class TestComputeBaseline:
    """Tests for ``compute_baseline``: per-metric averages over past runs."""

    def test_empty_runs(self) -> None:
        """No runs yields an empty baseline."""
        baseline = compute_baseline([])

        assert baseline == {}

    def test_single_run(self) -> None:
        """With one run, the baseline equals that run's metrics."""
        runs = [make_run("r1", {"bleu4": 0.75, "rouge_l": 0.80})]

        baseline = compute_baseline(runs)

        # Compare with approx like every other test here: the baseline is a
        # computed float, so bit-exact equality is an implementation detail.
        assert baseline["bleu4"] == pytest.approx(0.75)
        assert baseline["rouge_l"] == pytest.approx(0.80)

    def test_multiple_runs_average(self) -> None:
        """Three runs inside a window of three are averaged together."""
        runs = [
            make_run("r1", {"bleu4": 0.70}, day=3),
            make_run("r2", {"bleu4": 0.80}, day=2),
            make_run("r3", {"bleu4": 0.90}, day=1),
        ]

        baseline = compute_baseline(runs, window=3)

        assert baseline["bleu4"] == pytest.approx(0.80)  # (0.70+0.80+0.90)/3

    def test_window_limits_runs(self) -> None:
        """Runs beyond the window do not contribute to the baseline.

        The three contributing runs are both the most recent by timestamp
        and the first three in the list, so this test does not distinguish
        between those two selection rules.
        """
        runs = [
            make_run("r1", {"bleu4": 0.70}, day=5),  # most recent
            make_run("r2", {"bleu4": 0.80}, day=4),
            make_run("r3", {"bleu4": 0.90}, day=3),
            make_run("r4", {"bleu4": 0.60}, day=2),  # excluded
            make_run("r5", {"bleu4": 0.50}, day=1),  # excluded
        ]

        baseline = compute_baseline(runs, window=3)

        # Only first 3 runs: (0.70 + 0.80 + 0.90) / 3 = 0.80
        assert baseline["bleu4"] == pytest.approx(0.80)

    def test_partial_history(self) -> None:
        """A window larger than the history averages whatever runs exist."""
        runs = [
            make_run("r1", {"bleu4": 0.70}),
            make_run("r2", {"bleu4": 0.80}),
        ]

        baseline = compute_baseline(runs, window=10)

        # Only 2 runs available: (0.70 + 0.80) / 2 = 0.75
        assert baseline["bleu4"] == pytest.approx(0.75)

    def test_multiple_metrics(self) -> None:
        """Each metric is averaged independently of the others."""
        runs = [
            make_run("r1", {"bleu4": 0.70, "rouge_l": 0.75}),
            make_run("r2", {"bleu4": 0.80, "rouge_l": 0.85}),
        ]

        baseline = compute_baseline(runs)

        assert baseline["bleu4"] == pytest.approx(0.75)
        assert baseline["rouge_l"] == pytest.approx(0.80)

    def test_varying_metrics(self) -> None:
        """A metric missing from some runs is averaged over the runs that have it."""
        runs = [
            make_run("r1", {"bleu4": 0.70, "rouge_l": 0.75}),
            make_run("r2", {"bleu4": 0.80}),  # No rouge_l
        ]

        baseline = compute_baseline(runs)

        # bleu4 appears in both runs
        assert baseline["bleu4"] == pytest.approx(0.75)
        # rouge_l only appears in one run
        assert baseline["rouge_l"] == pytest.approx(0.75)
|
|
|
|
|
|
class TestDetectRegression:
    """Tests for ``detect_regression``: current metrics versus a baseline.

    Throughout these tests a delta is ``current - baseline`` for a metric,
    so a negative delta means the metric dropped.
    """

    def test_no_baseline(self) -> None:
        """An empty baseline produces no deltas and no regression."""
        report = detect_regression(
            current={"bleu4": 0.70},
            baseline={},
            tolerance=0.05,
        )

        assert not report.detected
        assert report.deltas == {}

    def test_no_regression_stable(self) -> None:
        """An unchanged metric has a zero delta and is not a regression."""
        report = detect_regression(
            current={"bleu4": 0.75},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )

        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(0.0)

    def test_no_regression_improved(self) -> None:
        """An improved metric has a positive delta and is not a regression."""
        report = detect_regression(
            current={"bleu4": 0.85},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )

        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(0.10)

    def test_no_regression_within_tolerance(self) -> None:
        """A drop smaller than the tolerance is not a regression."""
        report = detect_regression(
            current={"bleu4": 0.73},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )

        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.02)

    def test_regression_detected(self) -> None:
        """A drop larger than the tolerance is reported as a regression."""
        report = detect_regression(
            current={"bleu4": 0.65},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )

        assert report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.10)

    def test_regression_at_tolerance_boundary(self) -> None:
        """A drop exactly equal to the tolerance is not a regression."""
        # 1.0, 0.75 and 0.25 are exactly representable in binary floating
        # point, so the delta is exactly -tolerance with no rounding error.
        # (0.70 vs 0.75 with tolerance 0.05 would NOT land on the boundary.)
        # The implementation checks delta < -tolerance (strictly less than),
        # so a delta sitting on the boundary must not be flagged.
        report = detect_regression(
            current={"bleu4": 0.75},
            baseline={"bleu4": 1.0},
            tolerance=0.25,
        )

        # Delta is exactly -0.25, which is not < -tolerance
        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.25)

    def test_regression_just_beyond_tolerance(self) -> None:
        """A drop marginally larger than the tolerance is a regression."""
        report = detect_regression(
            current={"bleu4": 0.6999},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )

        # Delta is -0.0501, which is < -tolerance
        assert report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.0501)

    def test_multiple_metrics_any_regresses(self) -> None:
        """One regressed metric is enough to flag the whole report."""
        report = detect_regression(
            current={"bleu4": 0.65, "rouge_l": 0.80},
            baseline={"bleu4": 0.75, "rouge_l": 0.80},
            tolerance=0.05,
        )

        assert report.detected
        # Only bleu4 regressed
        assert report.deltas["bleu4"] == pytest.approx(-0.10)
        assert report.deltas["rouge_l"] == pytest.approx(0.0)

    def test_report_contains_all_values(self) -> None:
        """The report echoes its inputs and has a delta for every metric."""
        baseline = {"bleu4": 0.75, "rouge_l": 0.80}
        current = {"bleu4": 0.65, "rouge_l": 0.82}

        report = detect_regression(current, baseline, tolerance=0.05)

        assert report.baseline == baseline
        assert report.current == current
        assert report.tolerance == 0.05
        assert "bleu4" in report.deltas
        assert "rouge_l" in report.deltas

    def test_missing_metric_in_current(self) -> None:
        """A baseline metric absent from the current run counts as 0.0."""
        report = detect_regression(
            current={},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )

        # 0.0 - 0.75 = -0.75, which is a regression
        assert report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.75)
|