"""Tests for benchmark runner.""" from pathlib import Path import pytest from veritext.benchmark.models import BenchmarkRun from veritext.benchmark.runner import Benchmark from veritext.core.exceptions import RegressionDetectedError @pytest.fixture def benchmark(tmp_path: Path) -> Benchmark: """Create a Benchmark instance with temporary storage.""" return Benchmark("test-suite", storage_path=tmp_path / "benchmarks") @pytest.fixture def sample_data() -> tuple[list[str], list[str]]: """Sample candidates and references for testing.""" candidates = [ "The quick brown fox jumps over the lazy dog.", "A fast auburn fox leaps above the sleepy hound.", ] references = [ "The quick brown fox jumps over the lazy dog.", "The swift brown fox jumps over the lazy dog.", ] return candidates, references class TestBenchmarkInit: """Tests for Benchmark initialisation.""" def test_creates_storage_directory(self, tmp_path: Path) -> None: """Benchmark creates storage directory on init.""" storage_path = tmp_path / "benchmarks" Benchmark("my-suite", storage_path=storage_path) assert storage_path.exists() def test_name_property(self, benchmark: Benchmark) -> None: """Benchmark exposes its name.""" assert benchmark.name == "test-suite" class TestEvaluate: """Tests for the evaluate method.""" def test_evaluate_stores_run( self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]] ) -> None: """Evaluate creates and stores a benchmark run.""" candidates, references = sample_data run = benchmark.evaluate(candidates, references) assert isinstance(run, BenchmarkRun) assert run.benchmark_name == "test-suite" assert run.sample_count == 2 def test_evaluate_returns_metrics( self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]] ) -> None: """Evaluate computes default metrics.""" candidates, references = sample_data run = benchmark.evaluate(candidates, references) # Default metrics are rouge_l and bleu4 assert "rouge_l" in run.metrics assert "bleu4" in run.metrics assert 0.0 <= run.metrics["rouge_l"] <= 1.0 assert 0.0 <= run.metrics["bleu4"] <= 1.0 def test_evaluate_custom_metrics( self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]] ) -> None: """Evaluate can compute custom metrics.""" candidates, references = sample_data run = benchmark.evaluate( candidates, references, metrics=["bleu1", "bleu2", "rouge1"] ) assert "bleu1" in run.metrics assert "bleu2" in run.metrics assert "rouge1" in run.metrics assert "bleu4" not in run.metrics # Not requested def test_evaluate_with_metadata( self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]] ) -> None: """Evaluate can include metadata.""" candidates, references = sample_data run = benchmark.evaluate( candidates, references, metadata={"git_sha": "abc123", "model": "gpt-4"} ) assert run.metadata == {"git_sha": "abc123", "model": "gpt-4"} def test_evaluate_stores_retrievable( self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]] ) -> None: """Stored run can be retrieved.""" candidates, references = sample_data run = benchmark.evaluate(candidates, references) history = benchmark.get_history() assert len(history) == 1 assert history[0].id == run.id class TestCheckRegression: """Tests for regression checking.""" def test_check_no_runs(self, benchmark: Benchmark) -> None: """No regression when no runs exist.""" report = benchmark.check_regression() assert not report.detected assert report.baseline == {} assert report.current == {} def test_check_single_run( self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]] ) -> None: """No regression with single run (no baseline).""" candidates, references = sample_data benchmark.evaluate(candidates, references) report = benchmark.check_regression() # First run has no baseline to compare against assert not report.detected def test_check_stable_metrics( self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]] ) -> None: """No regression when metrics are stable.""" candidates, references = sample_data # Run multiple times with same data for _ in range(3): benchmark.evaluate(candidates, references) report = benchmark.check_regression() assert not report.detected def test_check_reports_regression(self, tmp_path: Path) -> None: """Reports regression when metrics drop significantly.""" benchmark = Benchmark("regress-test", storage_path=tmp_path / "benchmarks") # First run with good metrics good_candidates = ["The quick brown fox jumps."] good_references = ["The quick brown fox jumps."] benchmark.evaluate(good_candidates, good_references) # Second run with worse metrics (different text) bad_candidates = ["Something completely different here."] benchmark.evaluate(bad_candidates, good_references) report = benchmark.check_regression(tolerance=0.05) # Should detect regression since second run is very different assert report.detected or any(d < -0.05 for d in report.deltas.values()) class TestAssertNoRegression: """Tests for assert_no_regression method.""" def test_passes_when_stable( self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]] ) -> None: """Does not raise when metrics are stable.""" candidates, references = sample_data for _ in range(3): benchmark.evaluate(candidates, references) # Should not raise benchmark.assert_no_regression() def test_raises_on_regression(self, tmp_path: Path) -> None: """Raises RegressionDetectedError when quality drops.""" benchmark = Benchmark("regress-test", storage_path=tmp_path / "benchmarks") # Establish baseline with perfect match perfect = ["The quick brown fox."] benchmark.evaluate(perfect, perfect) # Second run with terrible match terrible = ["Completely unrelated text."] benchmark.evaluate(terrible, perfect) with pytest.raises(RegressionDetectedError): benchmark.assert_no_regression(tolerance=0.05) class TestGetHistory: """Tests for get_history method.""" def test_empty_history(self, benchmark: Benchmark) -> None: """Returns empty list when no runs.""" history = benchmark.get_history() assert history == [] def test_returns_runs( self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]] ) -> None: """Returns benchmark runs.""" candidates, references = sample_data run1 = benchmark.evaluate(candidates, references) run2 = benchmark.evaluate(candidates, references) history = benchmark.get_history() assert len(history) == 2 assert history[0].id == run2.id # Most recent first assert history[1].id == run1.id def test_respects_limit( self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]] ) -> None: """Respects limit parameter.""" candidates, references = sample_data for _ in range(5): benchmark.evaluate(candidates, references) history = benchmark.get_history(limit=3) assert len(history) == 3 def test_default_limit( self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]] ) -> None: """Default limit is 20.""" candidates, references = sample_data for _ in range(25): benchmark.evaluate(candidates, references) history = benchmark.get_history() assert len(history) == 20