benchmark tests

Comprehensive tests for models, storage, regression detection, and runner.
This commit is contained in:
2025-04-20 15:04:33 +00:00
parent 127eb9cac6
commit 9afa499af3
5 changed files with 826 additions and 0 deletions

View File

@@ -0,0 +1,218 @@
"""Tests for benchmark runner."""
from pathlib import Path
import pytest
from veritext.benchmark.models import BenchmarkRun
from veritext.benchmark.runner import Benchmark
from veritext.core.exceptions import RegressionDetectedError
@pytest.fixture
def benchmark(tmp_path: Path) -> Benchmark:
    """Build a ``Benchmark`` named "test-suite" stored under ``tmp_path``."""
    storage_dir = tmp_path / "benchmarks"
    return Benchmark("test-suite", storage_path=storage_dir)
@pytest.fixture
def sample_data() -> tuple[list[str], list[str]]:
    """Return ``(candidates, references)``: two parallel two-sentence lists.

    The first candidate is identical to its reference; the second candidate
    is worded differently from its reference.
    """
    # Each entry is (candidate, reference); split into parallel lists below.
    pairs = [
        (
            "The quick brown fox jumps over the lazy dog.",
            "The quick brown fox jumps over the lazy dog.",
        ),
        (
            "A fast auburn fox leaps above the sleepy hound.",
            "The swift brown fox jumps over the lazy dog.",
        ),
    ]
    candidates = [candidate for candidate, _ in pairs]
    references = [reference for _, reference in pairs]
    return candidates, references
class TestBenchmarkInit:
    """Tests covering ``Benchmark`` construction."""

    def test_creates_storage_directory(self, tmp_path: Path) -> None:
        """After constructing a benchmark, its storage path exists."""
        target_dir = tmp_path / "benchmarks"
        Benchmark("my-suite", storage_path=target_dir)
        assert target_dir.exists()

    def test_name_property(self, benchmark: Benchmark) -> None:
        """The fixture benchmark exposes ``name`` as "test-suite"."""
        expected_name = "test-suite"
        assert benchmark.name == expected_name
class TestEvaluate:
    """Tests covering the run object returned by ``Benchmark.evaluate``."""

    def test_evaluate_stores_run(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """The result is a ``BenchmarkRun`` tagged with the suite name and size."""
        # sample_data is (candidates, references); unpack it positionally.
        result = benchmark.evaluate(*sample_data)
        assert isinstance(result, BenchmarkRun)
        assert result.benchmark_name == "test-suite"
        assert result.sample_count == 2

    def test_evaluate_returns_metrics(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Without a ``metrics`` argument, rouge_l and bleu4 appear, each in [0, 1]."""
        result = benchmark.evaluate(*sample_data)
        # rouge_l and bleu4 are the metrics expected by default.
        default_metrics = ("rouge_l", "bleu4")
        for metric_name in default_metrics:
            assert metric_name in result.metrics
        for metric_name in default_metrics:
            assert 0.0 <= result.metrics[metric_name] <= 1.0

    def test_evaluate_custom_metrics(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Requested metrics are present and the unrequested bleu4 is absent."""
        requested = ("bleu1", "bleu2", "rouge1")
        # Pass a fresh list so the tuple we iterate over stays untouched.
        result = benchmark.evaluate(*sample_data, metrics=list(requested))
        for metric_name in requested:
            assert metric_name in result.metrics
        # bleu4 was not asked for, so it must not show up.
        assert "bleu4" not in result.metrics

    def test_evaluate_with_metadata(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Metadata handed to ``evaluate`` is available on the returned run."""
        expected = {"git_sha": "abc123", "model": "gpt-4"}
        # Hand over a copy so the comparison target is an independent dict.
        result = benchmark.evaluate(*sample_data, metadata=dict(expected))
        assert result.metadata == expected

    def test_evaluate_stores_retrievable(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """A single evaluated run is the only entry returned by ``get_history``."""
        result = benchmark.evaluate(*sample_data)
        stored = benchmark.get_history()
        assert len(stored) == 1
        assert stored[0].id == result.id
class TestCheckRegression:
    """Tests covering the report returned by ``Benchmark.check_regression``."""

    def test_check_no_runs(self, benchmark: Benchmark) -> None:
        """With nothing evaluated, no regression is flagged and both maps are empty."""
        outcome = benchmark.check_regression()
        assert not outcome.detected
        assert outcome.baseline == {}
        assert outcome.current == {}

    def test_check_single_run(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """A lone run does not flag a regression."""
        benchmark.evaluate(*sample_data)
        outcome = benchmark.check_regression()
        # The first run has no baseline to be compared against.
        assert not outcome.detected

    def test_check_stable_metrics(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Three evaluations of identical data do not flag a regression."""
        repeat_count = 3
        # Same inputs every time, so the metrics should not move.
        for _ in range(repeat_count):
            benchmark.evaluate(*sample_data)
        outcome = benchmark.check_regression()
        assert not outcome.detected

    def test_check_reports_regression(self, tmp_path: Path) -> None:
        """An exact-match run followed by an unrelated candidate shows a drop."""
        suite = Benchmark("regress-test", storage_path=tmp_path / "benchmarks")
        references = ["The quick brown fox jumps."]

        # Run 1: the candidate is identical to the reference.
        matching_candidates = ["The quick brown fox jumps."]
        suite.evaluate(matching_candidates, references)

        # Run 2: the candidate is entirely different text.
        unrelated_candidates = ["Something completely different here."]
        suite.evaluate(unrelated_candidates, references)

        outcome = suite.check_regression(tolerance=0.05)
        # NOTE(review): this disjunction also passes when ``detected`` is False,
        # provided some delta is below -0.05 -- confirm that leniency is intended.
        assert outcome.detected or any(
            delta < -0.05 for delta in outcome.deltas.values()
        )
class TestAssertNoRegression:
    """Tests covering ``Benchmark.assert_no_regression``."""

    def test_passes_when_stable(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Three evaluations of identical data let the assertion return quietly."""
        repeat_count = 3
        for _ in range(repeat_count):
            benchmark.evaluate(*sample_data)
        # The test passes simply by this call not raising.
        benchmark.assert_no_regression()

    def test_raises_on_regression(self, tmp_path: Path) -> None:
        """An exact-match run followed by an unrelated candidate raises."""
        suite = Benchmark("regress-test", storage_path=tmp_path / "benchmarks")

        # Baseline run: the same list serves as candidates and references.
        baseline_texts = ["The quick brown fox."]
        suite.evaluate(baseline_texts, baseline_texts)

        # Follow-up run: unrelated candidate scored against the same references.
        unrelated_texts = ["Completely unrelated text."]
        suite.evaluate(unrelated_texts, baseline_texts)

        with pytest.raises(RegressionDetectedError):
            suite.assert_no_regression(tolerance=0.05)
class TestGetHistory:
    """Tests covering ``Benchmark.get_history``."""

    def test_empty_history(self, benchmark: Benchmark) -> None:
        """With nothing evaluated, the history equals an empty list."""
        stored = benchmark.get_history()
        assert stored == []

    def test_returns_runs(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Two evaluated runs come back with the most recent one first."""
        older = benchmark.evaluate(*sample_data)
        newer = benchmark.evaluate(*sample_data)
        stored = benchmark.get_history()
        assert len(stored) == 2
        # Expected ordering: most recent first.
        assert [entry.id for entry in stored] == [newer.id, older.id]

    def test_respects_limit(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """With five stored runs, ``limit=3`` yields three entries."""
        total_runs = 5
        requested_limit = 3
        for _ in range(total_runs):
            benchmark.evaluate(*sample_data)
        stored = benchmark.get_history(limit=requested_limit)
        assert len(stored) == requested_limit

    def test_default_limit(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """With 25 stored runs and no explicit limit, 20 entries are returned."""
        total_runs = 25
        expected_count = 20
        for _ in range(total_runs):
            benchmark.evaluate(*sample_data)
        stored = benchmark.get_history()
        assert len(stored) == expected_count