test(benchmark): add benchmark module tests

Comprehensive tests for models, storage, regression detection, and runner.
2026-02-03 18:10:13 +00:00
parent 40fa39485e
commit 6d1bece815
5 changed files with 919 additions and 0 deletions
--- a/tests/test_benchmark/test_models.py
+++ b/tests/test_benchmark/test_models.py
@@ -0,0 +1,145 @@
+"""Tests for benchmark data models."""
+
+from datetime import UTC, datetime
+
+import pytest
+from pydantic import ValidationError
+
+from veritext.benchmark.models import BenchmarkRun, RegressionReport
+
+
+class TestBenchmarkRun:
+    """Tests for BenchmarkRun model."""
+
+    def test_create_benchmark_run(self) -> None:
+        """BenchmarkRun can be created with required fields."""
+        run = BenchmarkRun(
+            id="test-id-123",
+            benchmark_name="test-benchmark",
+            timestamp=datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC),
+            veritext_version="0.1.0-dev",
+            metrics={"bleu4": 0.75, "rouge_l": 0.82},
+            sample_count=100,
+        )
+
+        assert run.id == "test-id-123"
+        assert run.benchmark_name == "test-benchmark"
+        assert run.veritext_version == "0.1.0-dev"
+        assert run.metrics == {"bleu4": 0.75, "rouge_l": 0.82}
+        assert run.sample_count == 100
+        assert run.metadata == {}  # metadata omitted above; expected to default to {}
+
+    def test_create_with_metadata(self) -> None:
+        """BenchmarkRun can include optional metadata."""
+        run = BenchmarkRun(
+            id="test-id-456",
+            benchmark_name="test-benchmark",
+            timestamp=datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC),
+            veritext_version="0.1.0-dev",
+            metrics={"bleu4": 0.75},
+            sample_count=50,
+            metadata={"git_sha": "abc123", "model_version": "gpt-4"},
+        )
+
+        assert run.metadata == {"git_sha": "abc123", "model_version": "gpt-4"}
+
+    def test_frozen_model(self) -> None:
+        """BenchmarkRun is immutable."""
+        run = BenchmarkRun(
+            id="test-id",
+            benchmark_name="test",
+            timestamp=datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC),
+            veritext_version="0.1.0",
+            metrics={"bleu4": 0.5},
+            sample_count=10,
+        )
+
+        with pytest.raises(ValidationError):  # frozen: assignment must be rejected
+            run.id = "new-id"  # type: ignore[misc]
+
+    def test_serialisation(self) -> None:
+        """BenchmarkRun can be serialised to dict."""
+        run = BenchmarkRun(
+            id="test-id",
+            benchmark_name="test",
+            timestamp=datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC),
+            veritext_version="0.1.0",
+            metrics={"bleu4": 0.5},
+            sample_count=10,
+        )
+
+        data = run.model_dump()  # expected to be a dict keyed by field name
+        assert data["id"] == "test-id"
+        assert data["benchmark_name"] == "test"
+        assert data["metrics"] == {"bleu4": 0.5}
+
+
+class TestRegressionReport:
+    """Tests for RegressionReport model."""
+
+    def test_no_regression_summary(self) -> None:
+        """Summary indicates no regression when detected is False."""
+        report = RegressionReport(
+            detected=False,
+            baseline={"bleu4": 0.75, "rouge_l": 0.80},
+            current={"bleu4": 0.76, "rouge_l": 0.81},
+            deltas={"bleu4": 0.01, "rouge_l": 0.01},
+            tolerance=0.05,
+        )
+
+        assert "No regression detected" in report.summary  # substring check only
+
+    def test_regression_summary(self) -> None:
+        """Summary lists regressed metrics when detected is True."""
+        report = RegressionReport(
+            detected=True,
+            baseline={"bleu4": 0.75, "rouge_l": 0.80},
+            current={"bleu4": 0.65, "rouge_l": 0.78},
+            deltas={"bleu4": -0.10, "rouge_l": -0.02},
+            tolerance=0.05,
+        )
+
+        assert "Regression detected" in report.summary
+        assert "bleu4" in report.summary
+        assert "0.6500" in report.summary  # current bleu4, four decimals expected
+        assert "baseline: 0.7500" in report.summary  # baseline bleu4, labelled
+
+    def test_regression_excludes_within_tolerance(self) -> None:
+        """Summary only shows metrics that exceed tolerance."""
+        report = RegressionReport(
+            detected=True,
+            baseline={"bleu4": 0.75, "rouge_l": 0.80},
+            current={"bleu4": 0.65, "rouge_l": 0.78},
+            deltas={"bleu4": -0.10, "rouge_l": -0.02},
+            tolerance=0.05,
+        )
+
+        # rouge_l is -0.02, within tolerance of 0.05, so shouldn't appear
+        assert "rouge_l" not in report.summary
+        # bleu4 is -0.10, exceeds tolerance, so should appear
+        assert "bleu4" in report.summary
+
+    def test_frozen_model(self) -> None:
+        """RegressionReport is immutable."""
+        report = RegressionReport(
+            detected=False,
+            baseline={},
+            current={},
+            deltas={},
+            tolerance=0.05,
+        )
+
+        with pytest.raises(ValidationError):  # frozen: assignment must be rejected
+            report.detected = True  # type: ignore[misc]
+
+    def test_tolerance_in_summary(self) -> None:
+        """Summary includes tolerance threshold."""
+        report = RegressionReport(
+            detected=True,
+            baseline={"metric": 0.80},
+            current={"metric": 0.50},
+            deltas={"metric": -0.30},
+            tolerance=0.10,
+        )
+
+        assert "10.00%" in report.summary  # tolerance 0.10 expected as a percentage