benchmark tests
Comprehensive tests for models, storage, regression detection, and runner.
This commit is contained in:
207
tests/test_benchmark/test_regression.py
Normal file
207
tests/test_benchmark/test_regression.py
Normal file
@@ -0,0 +1,207 @@
|
||||
"""Tests for regression detection."""
|
||||
|
||||
from datetime import UTC, datetime
|
||||
|
||||
import pytest
|
||||
|
||||
from veritext.benchmark.models import BenchmarkRun
|
||||
from veritext.benchmark.regression import compute_baseline, detect_regression
|
||||
|
||||
|
||||
def make_run(
    run_id: str,
    metrics: dict[str, float],
    day: int = 1,
) -> BenchmarkRun:
    """Build a ``BenchmarkRun`` fixture for the regression tests.

    Args:
        run_id: Identifier stored on the run.
        metrics: Mapping of metric name to score stored on the run.
        day: Day of January 2025 used for the run's 12:00 UTC timestamp.

    Returns:
        A run for the "test" benchmark, version "0.1.0", with 10 samples.
    """
    # Only the day varies between fixtures; the rest of the timestamp is fixed.
    recorded_at = datetime(2025, 1, day, 12, 0, 0, tzinfo=UTC)
    return BenchmarkRun(
        id=run_id,
        benchmark_name="test",
        timestamp=recorded_at,
        veritext_version="0.1.0",
        metrics=metrics,
        sample_count=10,
    )
|
||||
|
||||
|
||||
class TestComputeBaseline:
    """Tests for ``compute_baseline`` over a list of benchmark runs."""

    def test_empty_runs(self) -> None:
        """An empty run list yields an empty baseline."""
        assert compute_baseline([]) == {}

    def test_single_run(self) -> None:
        """With one run, the baseline equals that run's metrics."""
        history = [make_run("r1", {"bleu4": 0.75, "rouge_l": 0.80})]

        result = compute_baseline(history)

        assert result["bleu4"] == 0.75
        assert result["rouge_l"] == 0.80

    def test_multiple_runs_average(self) -> None:
        """Three runs inside the window are averaged."""
        history = [
            make_run(run_id, {"bleu4": score}, day=day)
            for run_id, score, day in (
                ("r1", 0.70, 3),
                ("r2", 0.80, 2),
                ("r3", 0.90, 1),
            )
        ]

        result = compute_baseline(history, window=3)

        # (0.70 + 0.80 + 0.90) / 3
        assert result["bleu4"] == pytest.approx(0.80)

    def test_window_limits_runs(self) -> None:
        """Only the first ``window`` runs of the list contribute."""
        history = [
            make_run(run_id, {"bleu4": score}, day=day)
            for run_id, score, day in (
                ("r1", 0.70, 5),  # most recent
                ("r2", 0.80, 4),
                ("r3", 0.90, 3),
                ("r4", 0.60, 2),  # excluded
                ("r5", 0.50, 1),  # excluded
            )
        ]

        result = compute_baseline(history, window=3)

        # Only first 3 runs: (0.70 + 0.80 + 0.90) / 3 = 0.80
        assert result["bleu4"] == pytest.approx(0.80)

    def test_partial_history(self) -> None:
        """Fewer runs than the window are averaged over what is available."""
        history = [
            make_run(run_id, {"bleu4": score})
            for run_id, score in (("r1", 0.70), ("r2", 0.80))
        ]

        result = compute_baseline(history, window=10)

        # Only 2 runs available: (0.70 + 0.80) / 2 = 0.75
        assert result["bleu4"] == pytest.approx(0.75)

    def test_multiple_metrics(self) -> None:
        """Each metric is averaged independently of the others."""
        first = make_run("r1", {"bleu4": 0.70, "rouge_l": 0.75})
        second = make_run("r2", {"bleu4": 0.80, "rouge_l": 0.85})

        result = compute_baseline([first, second])

        assert result["bleu4"] == pytest.approx(0.75)
        assert result["rouge_l"] == pytest.approx(0.80)

    def test_varying_metrics(self) -> None:
        """A metric missing from some runs is averaged over the runs that have it."""
        with_both = make_run("r1", {"bleu4": 0.70, "rouge_l": 0.75})
        bleu_only = make_run("r2", {"bleu4": 0.80})  # No rouge_l

        result = compute_baseline([with_both, bleu_only])

        # bleu4 appears in both runs
        assert result["bleu4"] == pytest.approx(0.75)
        # rouge_l only appears in one run
        assert result["rouge_l"] == pytest.approx(0.75)
|
||||
|
||||
|
||||
class TestDetectRegression:
    """Tests for ``detect_regression`` comparing current metrics to a baseline."""

    def test_no_baseline(self) -> None:
        """An empty baseline produces no deltas and no regression."""
        report = detect_regression(
            current={"bleu4": 0.70},
            baseline={},
            tolerance=0.05,
        )

        assert not report.detected
        assert report.deltas == {}

    def test_no_regression_stable(self) -> None:
        """An unchanged metric has a zero delta and is not flagged."""
        report = detect_regression(
            current={"bleu4": 0.75},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )

        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(0.0)

    def test_no_regression_improved(self) -> None:
        """An improved metric has a positive delta and is not flagged."""
        report = detect_regression(
            current={"bleu4": 0.85},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )

        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(0.10)

    def test_no_regression_within_tolerance(self) -> None:
        """A drop smaller than the tolerance is not flagged."""
        report = detect_regression(
            current={"bleu4": 0.73},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )

        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.02)

    def test_regression_detected(self) -> None:
        """A drop larger than the tolerance is flagged."""
        report = detect_regression(
            current={"bleu4": 0.65},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )

        assert report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.10)

    def test_regression_at_tolerance_boundary(self) -> None:
        """A drop exactly equal to the tolerance is not flagged.

        The implementation is expected to check ``delta < -tolerance``
        (strictly less than), so a delta of exactly ``-tolerance`` must pass.
        """
        # 0.5, 0.75 and 0.25 are exactly representable in binary floating
        # point, so current - baseline is exactly -0.25 with no rounding
        # error. Decimal values such as 0.70 - 0.75 would land slightly past
        # -0.05 and would not exercise the boundary.
        report = detect_regression(
            current={"bleu4": 0.5},
            baseline={"bleu4": 0.75},
            tolerance=0.25,
        )

        # Delta is exactly -tolerance, which is not < -tolerance
        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.25)

    def test_regression_just_beyond_tolerance(self) -> None:
        """A drop marginally larger than the tolerance is flagged."""
        report = detect_regression(
            current={"bleu4": 0.6999},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )

        # Delta is -0.0501, which is < -tolerance
        assert report.detected

    def test_multiple_metrics_any_regresses(self) -> None:
        """One regressing metric is enough to flag the whole report."""
        report = detect_regression(
            current={"bleu4": 0.65, "rouge_l": 0.80},
            baseline={"bleu4": 0.75, "rouge_l": 0.80},
            tolerance=0.05,
        )

        assert report.detected
        # Only bleu4 regressed
        assert report.deltas["bleu4"] == pytest.approx(-0.10)
        assert report.deltas["rouge_l"] == pytest.approx(0.0)

    def test_report_contains_all_values(self) -> None:
        """The report echoes its inputs and has a delta for every metric."""
        baseline = {"bleu4": 0.75, "rouge_l": 0.80}
        current = {"bleu4": 0.65, "rouge_l": 0.82}

        report = detect_regression(current, baseline, tolerance=0.05)

        assert report.baseline == baseline
        assert report.current == current
        assert report.tolerance == 0.05
        assert "bleu4" in report.deltas
        assert "rouge_l" in report.deltas

    def test_missing_metric_in_current(self) -> None:
        """A baseline metric absent from ``current`` is treated as 0.0."""
        report = detect_regression(
            current={},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )

        # 0.0 - 0.75 = -0.75, which is a regression
        assert report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.75)
|
||||
Reference in New Issue
Block a user