Files
veritext/tests/test_benchmark/test_runner.py
Kai Chappell 6d1bece815 test(benchmark): add benchmark module tests
Comprehensive tests for models, storage, regression detection, and runner.
2026-02-03 18:10:13 +00:00

248 lines
8.1 KiB
Python

"""Tests for benchmark runner."""
from pathlib import Path
import pytest
from veritext.benchmark.models import BenchmarkRun
from veritext.benchmark.runner import Benchmark
from veritext.core.exceptions import RegressionDetectedError
@pytest.fixture
def benchmark(tmp_path: Path) -> Benchmark:
    """Provide a ``test-suite`` Benchmark whose storage sits under ``tmp_path``."""
    storage_dir = tmp_path / "benchmarks"
    return Benchmark("test-suite", storage_path=storage_dir)
@pytest.fixture
def sample_data() -> tuple[list[str], list[str]]:
    """Provide a ``(candidates, references)`` pair of two sentences each."""
    return (
        # Candidates: one exact copy of its reference, one loose paraphrase.
        [
            "The quick brown fox jumps over the lazy dog.",
            "A fast auburn fox leaps above the sleepy hound.",
        ],
        # References, aligned index-by-index with the candidates above.
        [
            "The quick brown fox jumps over the lazy dog.",
            "The swift brown fox jumps over the lazy dog.",
        ],
    )
class TestBenchmarkInit:
    """Construction behaviour of ``Benchmark``."""

    def test_creates_storage_directory(self, tmp_path: Path) -> None:
        """Constructing a Benchmark leaves its storage path present on disk."""
        target = tmp_path / "benchmarks"
        Benchmark("my-suite", storage_path=target)
        assert target.exists()

    def test_name_property(self, benchmark: Benchmark) -> None:
        """The ``name`` attribute reports the suite name used by the fixture."""
        expected_name = "test-suite"
        assert benchmark.name == expected_name
class TestEvaluate:
    """Behaviour of ``Benchmark.evaluate``."""

    def test_evaluate_stores_run(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """``evaluate`` returns a BenchmarkRun describing the evaluated batch."""
        cands, refs = sample_data
        result = benchmark.evaluate(cands, refs)

        assert isinstance(result, BenchmarkRun)
        assert result.benchmark_name == "test-suite"
        assert result.sample_count == 2

    def test_evaluate_returns_metrics(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Without an explicit metric list, rouge_l and bleu4 are reported in [0, 1]."""
        cands, refs = sample_data
        result = benchmark.evaluate(cands, refs)

        default_metrics = ("rouge_l", "bleu4")
        # Presence is checked for every metric before any value is inspected.
        for metric_name in default_metrics:
            assert metric_name in result.metrics
        for metric_name in default_metrics:
            assert 0.0 <= result.metrics[metric_name] <= 1.0

    def test_evaluate_custom_metrics(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Only the explicitly requested metrics appear in the run."""
        cands, refs = sample_data
        wanted = ("bleu1", "bleu2", "rouge1")
        result = benchmark.evaluate(cands, refs, metrics=list(wanted))

        for metric_name in wanted:
            assert metric_name in result.metrics
        # bleu4 was not requested, so it must be absent.
        assert "bleu4" not in result.metrics

    def test_evaluate_with_metadata(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Metadata passed to ``evaluate`` is carried on the returned run."""
        cands, refs = sample_data
        expected = {"git_sha": "abc123", "model": "gpt-4"}
        # Hand over a copy so the comparison is against an untouched dict.
        result = benchmark.evaluate(cands, refs, metadata=dict(expected))

        assert result.metadata == expected

    def test_evaluate_stores_retrievable(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """A run produced by ``evaluate`` shows up in ``get_history``."""
        cands, refs = sample_data
        result = benchmark.evaluate(cands, refs)

        stored = benchmark.get_history()
        assert len(stored) == 1
        assert stored[0].id == result.id
class TestCheckRegression:
    """Behaviour of ``Benchmark.check_regression``."""

    def test_check_no_runs(self, benchmark: Benchmark) -> None:
        """With no stored runs the report is negative and both sides are empty."""
        outcome = benchmark.check_regression()

        assert not outcome.detected
        assert outcome.baseline == {}
        assert outcome.current == {}

    def test_check_single_run(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """A lone run is not flagged as a regression."""
        cands, refs = sample_data
        benchmark.evaluate(cands, refs)

        # The very first run has nothing earlier to be compared against.
        outcome = benchmark.check_regression()
        assert not outcome.detected

    def test_check_stable_metrics(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Repeating the same evaluation does not trigger a regression."""
        cands, refs = sample_data
        repeats = 3
        # Identical inputs on every iteration.
        for _ in range(repeats):
            benchmark.evaluate(cands, refs)

        outcome = benchmark.check_regression()
        assert not outcome.detected

    def test_check_reports_regression(self, tmp_path: Path) -> None:
        """A sharp quality drop between two runs is visible in the report."""
        bench = Benchmark("regress-test", storage_path=tmp_path / "benchmarks")
        reference_texts = ["The quick brown fox jumps."]

        # Run 1: candidate text identical to the reference.
        bench.evaluate(["The quick brown fox jumps."], reference_texts)
        # Run 2: unrelated candidate text scored against the same reference.
        bench.evaluate(["Something completely different here."], reference_texts)

        outcome = bench.check_regression(tolerance=0.05)
        # Accept either the explicit flag or any metric delta below -0.05.
        assert outcome.detected or any(
            delta < -0.05 for delta in outcome.deltas.values()
        )
class TestAssertNoRegression:
    """Behaviour of ``Benchmark.assert_no_regression``."""

    def test_passes_when_stable(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """No exception is raised after repeated identical evaluations."""
        cands, refs = sample_data
        repeats = 3
        for _ in range(repeats):
            benchmark.evaluate(cands, refs)

        # The test passes simply by this call returning without raising.
        benchmark.assert_no_regression()

    def test_raises_on_regression(self, tmp_path: Path) -> None:
        """RegressionDetectedError is raised once the second run scores far worse."""
        bench = Benchmark("regress-test", storage_path=tmp_path / "benchmarks")
        reference_texts = ["The quick brown fox."]

        # Baseline run: the candidate list is the reference list itself.
        bench.evaluate(reference_texts, reference_texts)
        # Follow-up run: unrelated candidate against the same reference.
        unrelated = ["Completely unrelated text."]
        bench.evaluate(unrelated, reference_texts)

        with pytest.raises(RegressionDetectedError):
            bench.assert_no_regression(tolerance=0.05)
class TestGetHistory:
    """Behaviour of ``Benchmark.get_history``."""

    def test_empty_history(self, benchmark: Benchmark) -> None:
        """A benchmark with no runs yields an empty list."""
        assert benchmark.get_history() == []

    def test_returns_runs(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Stored runs come back with the most recent one first."""
        cands, refs = sample_data
        older = benchmark.evaluate(cands, refs)
        newer = benchmark.evaluate(cands, refs)

        stored = benchmark.get_history()
        assert len(stored) == 2
        head, tail = stored[0], stored[1]
        assert head.id == newer.id  # Most recent first
        assert tail.id == older.id

    def test_respects_limit(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """An explicit ``limit`` caps the number of runs returned."""
        cands, refs = sample_data
        total_runs = 5
        for _ in range(total_runs):
            benchmark.evaluate(cands, refs)

        stored = benchmark.get_history(limit=3)
        assert len(stored) == 3

    def test_default_limit(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """With 25 runs stored and no ``limit`` given, 20 runs are returned."""
        cands, refs = sample_data
        total_runs = 25
        for _ in range(total_runs):
            benchmark.evaluate(cands, refs)

        stored = benchmark.get_history()
        assert len(stored) == 20