test(benchmark): add benchmark module tests

Comprehensive tests for models, storage, regression detection, and runner.
This commit is contained in:
2026-02-03 18:10:13 +00:00
parent 40fa39485e
commit 6d1bece815
5 changed files with 919 additions and 0 deletions

View File

@@ -0,0 +1,145 @@
"""Tests for benchmark data models."""
from datetime import UTC, datetime
import pytest
from pydantic import ValidationError
from veritext.benchmark.models import BenchmarkRun, RegressionReport
class TestBenchmarkRun:
    """Checks on building, mutating and dumping ``BenchmarkRun`` instances."""

    def test_create_benchmark_run(self) -> None:
        """Required fields are stored as given and ``metadata`` equals ``{}``."""
        when = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)
        created = BenchmarkRun(
            id="test-id-123",
            benchmark_name="test-benchmark",
            timestamp=when,
            veritext_version="0.1.0-dev",
            metrics={"bleu4": 0.75, "rouge_l": 0.82},
            sample_count=100,
        )

        # Scalar fields are compared as one tuple so a failure shows them all.
        scalar_fields = (
            created.id,
            created.benchmark_name,
            created.veritext_version,
        )
        assert scalar_fields == ("test-id-123", "test-benchmark", "0.1.0-dev")

        expected_metrics = {"bleu4": 0.75, "rouge_l": 0.82}
        assert created.metrics == expected_metrics
        assert created.sample_count == 100
        # No metadata was supplied, so the field must compare equal to {}.
        assert created.metadata == {}

    def test_create_with_metadata(self) -> None:
        """Metadata passed to the constructor is readable from the instance."""
        when = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)
        with_extras = BenchmarkRun(
            id="test-id-456",
            benchmark_name="test-benchmark",
            timestamp=when,
            veritext_version="0.1.0-dev",
            metrics={"bleu4": 0.75},
            sample_count=50,
            metadata={"git_sha": "abc123", "model_version": "gpt-4"},
        )

        expected_metadata = {"git_sha": "abc123", "model_version": "gpt-4"}
        assert with_extras.metadata == expected_metadata

    def test_frozen_model(self) -> None:
        """Assigning to ``id`` on a constructed run raises ``ValidationError``."""
        when = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)
        frozen_run = BenchmarkRun(
            id="test-id",
            benchmark_name="test",
            timestamp=when,
            veritext_version="0.1.0",
            metrics={"bleu4": 0.5},
            sample_count=10,
        )

        with pytest.raises(ValidationError):
            frozen_run.id = "new-id"  # type: ignore[misc]

    def test_serialisation(self) -> None:
        """``model_dump()`` returns a mapping carrying the constructed values."""
        when = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)
        dumped = BenchmarkRun(
            id="test-id",
            benchmark_name="test",
            timestamp=when,
            veritext_version="0.1.0",
            metrics={"bleu4": 0.5},
            sample_count=10,
        ).model_dump()

        assert dumped["id"] == "test-id"
        assert dumped["benchmark_name"] == "test"
        assert dumped["metrics"] == {"bleu4": 0.5}
class TestRegressionReport:
    """Checks on ``RegressionReport.summary`` text and on field assignment."""

    def test_no_regression_summary(self) -> None:
        """With ``detected=False`` the summary contains the no-regression text."""
        clean_report = RegressionReport(
            detected=False,
            baseline={"bleu4": 0.75, "rouge_l": 0.80},
            current={"bleu4": 0.76, "rouge_l": 0.81},
            deltas={"bleu4": 0.01, "rouge_l": 0.01},
            tolerance=0.05,
        )

        summary_text = clean_report.summary
        assert "No regression detected" in summary_text

    def test_regression_summary(self) -> None:
        """With ``detected=True`` the summary names the metric and its values."""
        regressed = RegressionReport(
            detected=True,
            baseline={"bleu4": 0.75, "rouge_l": 0.80},
            current={"bleu4": 0.65, "rouge_l": 0.78},
            deltas={"bleu4": -0.10, "rouge_l": -0.02},
            tolerance=0.05,
        )

        summary_text = regressed.summary
        # Headline, metric name, current value and baseline value, in turn.
        for fragment in (
            "Regression detected",
            "bleu4",
            "0.6500",
            "baseline: 0.7500",
        ):
            assert fragment in summary_text

    def test_regression_excludes_within_tolerance(self) -> None:
        """A metric whose delta stays inside the tolerance is left out."""
        regressed = RegressionReport(
            detected=True,
            baseline={"bleu4": 0.75, "rouge_l": 0.80},
            current={"bleu4": 0.65, "rouge_l": 0.78},
            deltas={"bleu4": -0.10, "rouge_l": -0.02},
            tolerance=0.05,
        )

        summary_text = regressed.summary
        # rouge_l moved by -0.02, inside the 0.05 tolerance: must be absent.
        assert "rouge_l" not in summary_text
        # bleu4 moved by -0.10, beyond the tolerance: must be present.
        assert "bleu4" in summary_text

    def test_frozen_model(self) -> None:
        """Assigning to ``detected`` on a report raises ``ValidationError``."""
        empty_report = RegressionReport(
            detected=False,
            baseline={},
            current={},
            deltas={},
            tolerance=0.05,
        )

        with pytest.raises(ValidationError):
            empty_report.detected = True  # type: ignore[misc]

    def test_tolerance_in_summary(self) -> None:
        """A tolerance of 0.10 shows up in the summary as ``10.00%``."""
        regressed = RegressionReport(
            detected=True,
            baseline={"metric": 0.80},
            current={"metric": 0.50},
            deltas={"metric": -0.30},
            tolerance=0.10,
        )

        summary_text = regressed.summary
        assert "10.00%" in summary_text