test(benchmark): add benchmark module tests
Comprehensive tests for models, storage, regression detection, and runner.
This commit is contained in:
145
tests/test_benchmark/test_models.py
Normal file
145
tests/test_benchmark/test_models.py
Normal file
@@ -0,0 +1,145 @@
|
||||
"""Tests for benchmark data models."""
|
||||
|
||||
from datetime import UTC, datetime
|
||||
|
||||
import pytest
|
||||
from pydantic import ValidationError
|
||||
|
||||
from veritext.benchmark.models import BenchmarkRun, RegressionReport
|
||||
|
||||
|
||||
class TestBenchmarkRun:
    """Checks for BenchmarkRun: construction, metadata, immutability, dumping."""

    def test_create_benchmark_run(self) -> None:
        """Required fields are stored as given and metadata comes back empty."""
        recorded_at = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)

        benchmark_run = BenchmarkRun(
            id="test-id-123",
            benchmark_name="test-benchmark",
            timestamp=recorded_at,
            veritext_version="0.1.0-dev",
            metrics={"bleu4": 0.75, "rouge_l": 0.82},
            sample_count=100,
        )

        assert benchmark_run.id == "test-id-123"
        assert benchmark_run.benchmark_name == "test-benchmark"
        assert benchmark_run.veritext_version == "0.1.0-dev"
        assert benchmark_run.metrics == {"bleu4": 0.75, "rouge_l": 0.82}
        assert benchmark_run.sample_count == 100
        # No metadata was passed, so the test expects an empty mapping.
        assert benchmark_run.metadata == {}

    def test_create_with_metadata(self) -> None:
        """Metadata passed to the constructor is readable from the instance."""
        recorded_at = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)

        benchmark_run = BenchmarkRun(
            id="test-id-456",
            benchmark_name="test-benchmark",
            timestamp=recorded_at,
            veritext_version="0.1.0-dev",
            metrics={"bleu4": 0.75},
            sample_count=50,
            metadata={"git_sha": "abc123", "model_version": "gpt-4"},
        )

        assert benchmark_run.metadata == {
            "git_sha": "abc123",
            "model_version": "gpt-4",
        }

    def test_frozen_model(self) -> None:
        """Assigning to a field of an existing run must raise ValidationError."""
        recorded_at = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)
        benchmark_run = BenchmarkRun(
            id="test-id",
            benchmark_name="test",
            timestamp=recorded_at,
            veritext_version="0.1.0",
            metrics={"bleu4": 0.5},
            sample_count=10,
        )

        with pytest.raises(ValidationError):
            benchmark_run.id = "new-id"  # type: ignore[misc]

    def test_serialisation(self) -> None:
        """model_dump() returns a dict carrying the id, name and metrics."""
        recorded_at = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)
        benchmark_run = BenchmarkRun(
            id="test-id",
            benchmark_name="test",
            timestamp=recorded_at,
            veritext_version="0.1.0",
            metrics={"bleu4": 0.5},
            sample_count=10,
        )

        dumped = benchmark_run.model_dump()

        expected_entries = {
            "id": "test-id",
            "benchmark_name": "test",
            "metrics": {"bleu4": 0.5},
        }
        for key, expected_value in expected_entries.items():
            assert dumped[key] == expected_value
|
||||
|
||||
|
||||
class TestRegressionReport:
    """Checks for RegressionReport: summary text and immutability."""

    @staticmethod
    def _bleu_regression_report() -> "RegressionReport":
        """Build a detected report: bleu4 delta -0.10, rouge_l delta -0.02."""
        return RegressionReport(
            detected=True,
            baseline={"bleu4": 0.75, "rouge_l": 0.80},
            current={"bleu4": 0.65, "rouge_l": 0.78},
            deltas={"bleu4": -0.10, "rouge_l": -0.02},
            tolerance=0.05,
        )

    def test_no_regression_summary(self) -> None:
        """With detected=False the summary says no regression was detected."""
        clean_report = RegressionReport(
            detected=False,
            baseline={"bleu4": 0.75, "rouge_l": 0.80},
            current={"bleu4": 0.76, "rouge_l": 0.81},
            deltas={"bleu4": 0.01, "rouge_l": 0.01},
            tolerance=0.05,
        )

        assert "No regression detected" in clean_report.summary

    def test_regression_summary(self) -> None:
        """With detected=True the summary names bleu4 and both of its values."""
        summary_text = self._bleu_regression_report().summary

        expected_fragments = (
            "Regression detected",
            "bleu4",
            "0.6500",
            "baseline: 0.7500",
        )
        for fragment in expected_fragments:
            assert fragment in summary_text

    def test_regression_excludes_within_tolerance(self) -> None:
        """Only metrics whose drop exceeds the tolerance are listed."""
        summary_text = self._bleu_regression_report().summary

        # rouge_l moved by -0.02, inside the 0.05 tolerance: must be absent.
        assert "rouge_l" not in summary_text
        # bleu4 moved by -0.10, beyond the tolerance: must be present.
        assert "bleu4" in summary_text

    def test_frozen_model(self) -> None:
        """Assigning to a field of an existing report must raise ValidationError."""
        empty_report = RegressionReport(
            detected=False,
            baseline={},
            current={},
            deltas={},
            tolerance=0.05,
        )

        with pytest.raises(ValidationError):
            empty_report.detected = True  # type: ignore[misc]

    def test_tolerance_in_summary(self) -> None:
        """A tolerance of 0.10 is expected in the summary as "10.00%"."""
        steep_drop_report = RegressionReport(
            detected=True,
            baseline={"metric": 0.80},
            current={"metric": 0.50},
            deltas={"metric": -0.30},
            tolerance=0.10,
        )

        assert "10.00%" in steep_drop_report.summary
|
||||
Reference in New Issue
Block a user