248 lines
8.1 KiB
Python
248 lines
8.1 KiB
Python
"""Tests for benchmark runner."""
|
|
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from veritext.benchmark.models import BenchmarkRun
|
|
from veritext.benchmark.runner import Benchmark
|
|
from veritext.core.exceptions import RegressionDetectedError
|
|
|
|
|
|
@pytest.fixture
def benchmark(tmp_path: Path) -> Benchmark:
    """Provide a Benchmark named "test-suite" backed by per-test storage.

    The storage path is the "benchmarks" entry under pytest's ``tmp_path``.
    """
    storage_location = tmp_path / "benchmarks"
    return Benchmark("test-suite", storage_path=storage_location)
|
|
|
|
|
|
@pytest.fixture
def sample_data() -> tuple[list[str], list[str]]:
    """Provide aligned candidate and reference sentences.

    Returns a ``(candidates, references)`` tuple of two equally long lists.
    The first pair is an exact match; the second pair is worded differently.
    """
    # Each entry is (candidate, reference); the lists are split out below.
    aligned_pairs = [
        (
            "The quick brown fox jumps over the lazy dog.",
            "The quick brown fox jumps over the lazy dog.",
        ),
        (
            "A fast auburn fox leaps above the sleepy hound.",
            "The swift brown fox jumps over the lazy dog.",
        ),
    ]
    candidates = [candidate for candidate, _ in aligned_pairs]
    references = [reference for _, reference in aligned_pairs]
    return candidates, references
|
|
|
|
|
|
class TestBenchmarkInit:
    """Construction behaviour of Benchmark."""

    def test_creates_storage_directory(self, tmp_path: Path) -> None:
        """Constructing a Benchmark leaves its storage path present on disk."""
        target = tmp_path / "benchmarks"

        # Only the constructor's side effect matters; the instance is unused.
        Benchmark("my-suite", storage_path=target)

        assert target.exists()

    def test_name_property(self, benchmark: Benchmark) -> None:
        """The ``name`` attribute reports the suite name."""
        expected_name = "test-suite"
        assert benchmark.name == expected_name
|
|
|
|
|
|
class TestEvaluate:
    """Behaviour of ``Benchmark.evaluate``."""

    def test_evaluate_stores_run(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """The result is a BenchmarkRun carrying the suite name and sample count."""
        # sample_data is (candidates, references), matching the positional order.
        result = benchmark.evaluate(*sample_data)

        assert isinstance(result, BenchmarkRun)
        assert result.benchmark_name == "test-suite"
        assert result.sample_count == 2

    def test_evaluate_returns_metrics(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Without a ``metrics`` argument, rouge_l and bleu4 are reported in [0, 1]."""
        default_names = ("rouge_l", "bleu4")

        result = benchmark.evaluate(*sample_data)

        # Presence is checked for every metric before any range check.
        for metric_name in default_names:
            assert metric_name in result.metrics
        for metric_name in default_names:
            assert 0.0 <= result.metrics[metric_name] <= 1.0

    def test_evaluate_custom_metrics(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Only the explicitly requested metrics appear in the result."""
        wanted = ("bleu1", "bleu2", "rouge1")
        cands, refs = sample_data

        # A fresh list is handed over so the expectations stay independent of it.
        result = benchmark.evaluate(cands, refs, metrics=list(wanted))

        for metric_name in wanted:
            assert metric_name in result.metrics
        assert "bleu4" not in result.metrics  # Not requested

    def test_evaluate_with_metadata(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Metadata passed to ``evaluate`` is available on the returned run."""
        expected_metadata = {"git_sha": "abc123", "model": "gpt-4"}
        cands, refs = sample_data

        # Pass a copy so the comparison below uses an untouched dict.
        result = benchmark.evaluate(cands, refs, metadata=dict(expected_metadata))

        assert result.metadata == expected_metadata

    def test_evaluate_stores_retrievable(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """A run produced by ``evaluate`` shows up in ``get_history``."""
        stored = benchmark.evaluate(*sample_data)

        past_runs = benchmark.get_history()

        assert len(past_runs) == 1
        assert past_runs[0].id == stored.id
|
|
|
|
|
|
class TestCheckRegression:
    """Behaviour of ``Benchmark.check_regression``."""

    def test_check_no_runs(self, benchmark: Benchmark) -> None:
        """With no stored runs the report is negative and both snapshots are empty."""
        outcome = benchmark.check_regression()

        assert not outcome.detected
        assert outcome.baseline == {}
        assert outcome.current == {}

    def test_check_single_run(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """A single stored run does not produce a regression."""
        benchmark.evaluate(*sample_data)

        outcome = benchmark.check_regression()

        # A lone run has no earlier baseline to be compared against.
        assert not outcome.detected

    def test_check_stable_metrics(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Repeating the same evaluation does not produce a regression."""
        cands, refs = sample_data
        repeat_count = 3

        # Identical inputs every time, so the metrics should not move.
        for _attempt in range(repeat_count):
            benchmark.evaluate(cands, refs)

        outcome = benchmark.check_regression()
        assert not outcome.detected

    def test_check_reports_regression(self, tmp_path: Path) -> None:
        """A sharp quality drop is flagged or visible as a large negative delta."""
        suite = Benchmark("regress-test", storage_path=tmp_path / "benchmarks")
        reference_texts = ["The quick brown fox jumps."]

        # Baseline run: the candidate is identical to the reference.
        suite.evaluate(["The quick brown fox jumps."], reference_texts)

        # Follow-up run: the candidate shares no wording with the reference.
        suite.evaluate(["Something completely different here."], reference_texts)

        outcome = suite.check_regression(tolerance=0.05)

        # Either the report flags the regression outright, or at least one
        # metric delta must be below the negative tolerance.
        if not outcome.detected:
            assert any(delta < -0.05 for delta in outcome.deltas.values())
|
|
|
|
|
|
class TestAssertNoRegression:
    """Behaviour of ``Benchmark.assert_no_regression``."""

    def test_passes_when_stable(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Repeated identical evaluations let the assertion pass silently."""
        cands, refs = sample_data
        repeat_count = 3

        for _attempt in range(repeat_count):
            benchmark.evaluate(cands, refs)

        # Completing without an exception is the whole expectation here.
        benchmark.assert_no_regression()

    def test_raises_on_regression(self, tmp_path: Path) -> None:
        """A quality drop beyond the tolerance raises RegressionDetectedError."""
        suite = Benchmark("regress-test", storage_path=tmp_path / "benchmarks")
        matching_text = ["The quick brown fox."]
        unrelated_text = ["Completely unrelated text."]

        # Baseline: the same list serves as both candidates and references.
        suite.evaluate(matching_text, matching_text)

        # Follow-up: an unrelated candidate scored against the same references.
        suite.evaluate(unrelated_text, matching_text)

        with pytest.raises(RegressionDetectedError):
            suite.assert_no_regression(tolerance=0.05)
|
|
|
|
|
|
class TestGetHistory:
    """Behaviour of ``Benchmark.get_history``."""

    def test_empty_history(self, benchmark: Benchmark) -> None:
        """A benchmark with no runs yields an empty list."""
        assert benchmark.get_history() == []

    def test_returns_runs(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Stored runs come back with the most recent one first."""
        cands, refs = sample_data

        earlier = benchmark.evaluate(cands, refs)
        later = benchmark.evaluate(cands, refs)

        past_runs = benchmark.get_history()

        assert len(past_runs) == 2
        # Most recent first
        assert [entry.id for entry in past_runs] == [later.id, earlier.id]

    def test_respects_limit(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """The ``limit`` argument caps how many runs are returned."""
        cands, refs = sample_data
        stored_runs = 5
        requested = 3

        for _attempt in range(stored_runs):
            benchmark.evaluate(cands, refs)

        assert len(benchmark.get_history(limit=requested)) == requested

    def test_default_limit(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Without ``limit``, at most 20 runs are returned."""
        cands, refs = sample_data
        stored_runs = 25
        expected_default = 20

        # Store more runs than the default cap so the truncation is observable.
        for _attempt in range(stored_runs):
            benchmark.evaluate(cands, refs)

        assert len(benchmark.get_history()) == expected_default
|