test(benchmark): add benchmark module tests

Comprehensive tests for models, storage, regression detection, and runner.
This commit is contained in:
2026-02-03 18:10:13 +00:00
parent 40fa39485e
commit 6d1bece815
5 changed files with 919 additions and 0 deletions

View File

@@ -0,0 +1,229 @@
"""Tests for regression detection."""
from datetime import UTC, datetime
import pytest
from veritext.benchmark.models import BenchmarkRun
from veritext.benchmark.regression import compute_baseline, detect_regression
def make_run(
    run_id: str,
    metrics: dict[str, float],
    day: int = 1,
) -> BenchmarkRun:
    """Build a ``BenchmarkRun`` fixture for the regression tests.

    Args:
        run_id: Identifier stored on the run.
        metrics: Metric name to score mapping stored on the run.
        day: Day of January 2025 used for the run's timestamp.

    Returns:
        A run named ``"test"`` at version ``"0.1.0"`` with ten samples,
        timestamped at 12:00:00 UTC on the requested day.
    """
    # Only the day varies between fixtures; the time of day is pinned to noon UTC.
    noon_utc = datetime(2025, 1, day, hour=12, minute=0, second=0, tzinfo=UTC)
    return BenchmarkRun(
        id=run_id,
        benchmark_name="test",
        timestamp=noon_utc,
        veritext_version="0.1.0",
        metrics=metrics,
        sample_count=10,
    )
class TestComputeBaseline:
    """Exercise ``compute_baseline`` over assorted run histories."""

    def test_empty_runs(self) -> None:
        """An empty history yields an empty baseline."""
        assert compute_baseline([]) == {}

    def test_single_run(self) -> None:
        """With one run, the baseline mirrors that run's metrics."""
        only_run = make_run("r1", {"bleu4": 0.75, "rouge_l": 0.80})
        result = compute_baseline([only_run])
        assert result["bleu4"] == 0.75
        assert result["rouge_l"] == 0.80

    def test_multiple_runs_average(self) -> None:
        """The baseline is the mean over every run inside the window."""
        specs = [("r1", 0.70, 3), ("r2", 0.80, 2), ("r3", 0.90, 1)]
        history = [
            make_run(run_id, {"bleu4": score}, day=day)
            for run_id, score, day in specs
        ]
        result = compute_baseline(history, window=3)
        # Mean of 0.70, 0.80 and 0.90.
        assert result["bleu4"] == pytest.approx(0.80)

    def test_window_limits_runs(self) -> None:
        """Runs beyond the window size do not contribute."""
        specs = [
            ("r1", 0.70, 5),  # most recent
            ("r2", 0.80, 4),
            ("r3", 0.90, 3),
            ("r4", 0.60, 2),  # outside the window
            ("r5", 0.50, 1),  # outside the window
        ]
        history = [
            make_run(run_id, {"bleu4": score}, day=day)
            for run_id, score, day in specs
        ]
        result = compute_baseline(history, window=3)
        # Expected to average only the leading three entries: 0.70, 0.80, 0.90.
        assert result["bleu4"] == pytest.approx(0.80)

    def test_partial_history(self) -> None:
        """A history shorter than the window is averaged in full."""
        history = [
            make_run("r1", {"bleu4": 0.70}),
            make_run("r2", {"bleu4": 0.80}),
        ]
        result = compute_baseline(history, window=10)
        # Two runs available, so the mean is (0.70 + 0.80) / 2.
        assert result["bleu4"] == pytest.approx(0.75)

    def test_multiple_metrics(self) -> None:
        """Every metric present gets its own baseline value."""
        first = make_run("r1", {"bleu4": 0.70, "rouge_l": 0.75})
        second = make_run("r2", {"bleu4": 0.80, "rouge_l": 0.85})
        result = compute_baseline([first, second])
        assert result["bleu4"] == pytest.approx(0.75)
        assert result["rouge_l"] == pytest.approx(0.80)

    def test_varying_metrics(self) -> None:
        """Runs may report different metric sets."""
        with_both = make_run("r1", {"bleu4": 0.70, "rouge_l": 0.75})
        bleu_only = make_run("r2", {"bleu4": 0.80})  # no rouge_l reported
        result = compute_baseline([with_both, bleu_only])
        # bleu4 is reported by both runs.
        assert result["bleu4"] == pytest.approx(0.75)
        # rouge_l is reported once, so its baseline is that single value.
        assert result["rouge_l"] == pytest.approx(0.75)
class TestDetectRegression:
    """Tests for regression detection.

    Each test calls ``detect_regression`` with ``current`` and ``baseline``
    metric mappings plus a ``tolerance``, then checks the report's
    ``detected`` flag and per-metric ``deltas`` (current minus baseline).
    """

    def test_no_baseline(self) -> None:
        """No regression when baseline is empty."""
        report = detect_regression(
            current={"bleu4": 0.70},
            baseline={},
            tolerance=0.05,
        )
        assert not report.detected
        assert report.deltas == {}

    def test_no_regression_stable(self) -> None:
        """No regression when metrics are stable."""
        report = detect_regression(
            current={"bleu4": 0.75},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )
        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(0.0)

    def test_no_regression_improved(self) -> None:
        """No regression when metrics improved."""
        report = detect_regression(
            current={"bleu4": 0.85},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )
        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(0.10)

    def test_no_regression_within_tolerance(self) -> None:
        """No regression when drop is within tolerance."""
        report = detect_regression(
            current={"bleu4": 0.73},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )
        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.02)

    def test_regression_detected(self) -> None:
        """Regression detected when metric drops beyond tolerance."""
        report = detect_regression(
            current={"bleu4": 0.65},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )
        assert report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.10)

    def test_regression_at_tolerance_boundary(self) -> None:
        """Drop exactly equal to the tolerance is not a regression."""
        # 0.50, 0.75 and 0.25 are exactly representable in binary floating
        # point, so the delta is exactly -tolerance with no rounding error.
        # Decimal values such as 0.70 - 0.75 land slightly below -0.05 and
        # would trip a strict comparison, which is why they are avoided here.
        report = detect_regression(
            current={"bleu4": 0.50},
            baseline={"bleu4": 0.75},
            tolerance=0.25,
        )
        # Detection is expected to be strict (delta < -tolerance), so a drop
        # of exactly the tolerance must not be flagged.
        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.25)

    def test_regression_just_beyond_tolerance(self) -> None:
        """Just beyond tolerance is a regression."""
        report = detect_regression(
            current={"bleu4": 0.6999},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )
        # Delta is -0.0501, which is < -tolerance.
        assert report.detected

    def test_multiple_metrics_any_regresses(self) -> None:
        """Regression detected if any metric exceeds tolerance."""
        report = detect_regression(
            current={"bleu4": 0.65, "rouge_l": 0.80},
            baseline={"bleu4": 0.75, "rouge_l": 0.80},
            tolerance=0.05,
        )
        assert report.detected
        # Only bleu4 regressed; rouge_l is unchanged.
        assert report.deltas["bleu4"] == pytest.approx(-0.10)
        assert report.deltas["rouge_l"] == pytest.approx(0.0)

    def test_report_contains_all_values(self) -> None:
        """Report includes baseline, current, tolerance, and deltas."""
        baseline = {"bleu4": 0.75, "rouge_l": 0.80}
        current = {"bleu4": 0.65, "rouge_l": 0.82}
        report = detect_regression(current, baseline, tolerance=0.05)
        assert report.baseline == baseline
        assert report.current == current
        assert report.tolerance == 0.05
        assert "bleu4" in report.deltas
        assert "rouge_l" in report.deltas

    def test_missing_metric_in_current(self) -> None:
        """Missing metric in current treated as zero."""
        report = detect_regression(
            current={},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )
        # 0.0 - 0.75 = -0.75, which is a regression.
        assert report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.75)