"""Tests for benchmark data models."""
|
|
|
|
from datetime import UTC, datetime
|
|
|
|
import pytest
|
|
from pydantic import ValidationError
|
|
|
|
from veritext.benchmark.models import BenchmarkRun, RegressionReport
|
|
|
|
|
|
class TestBenchmarkRun:
    """Checks for building, freezing and dumping a BenchmarkRun."""

    def test_create_benchmark_run(self) -> None:
        """Required fields are readable back as given; metadata defaults to {}."""
        recorded_at = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)
        created = BenchmarkRun(
            id="test-id-123",
            benchmark_name="test-benchmark",
            timestamp=recorded_at,
            veritext_version="0.1.0-dev",
            metrics={"bleu4": 0.75, "rouge_l": 0.82},
            sample_count=100,
        )

        assert created.id == "test-id-123"
        assert created.benchmark_name == "test-benchmark"
        assert created.veritext_version == "0.1.0-dev"
        assert created.metrics == {"bleu4": 0.75, "rouge_l": 0.82}
        assert created.sample_count == 100
        # No metadata was passed in, so this checks the model's default value.
        assert created.metadata == {}

    def test_create_with_metadata(self) -> None:
        """Metadata supplied at construction is kept on the instance."""
        recorded_at = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)
        with_extras = BenchmarkRun(
            id="test-id-456",
            benchmark_name="test-benchmark",
            timestamp=recorded_at,
            veritext_version="0.1.0-dev",
            metrics={"bleu4": 0.75},
            sample_count=50,
            metadata={"git_sha": "abc123", "model_version": "gpt-4"},
        )

        assert with_extras.metadata == {
            "git_sha": "abc123",
            "model_version": "gpt-4",
        }

    def test_frozen_model(self) -> None:
        """Assigning to a field of an existing BenchmarkRun raises ValidationError."""
        recorded_at = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)
        locked = BenchmarkRun(
            id="test-id",
            benchmark_name="test",
            timestamp=recorded_at,
            veritext_version="0.1.0",
            metrics={"bleu4": 0.5},
            sample_count=10,
        )

        # The assignment itself is the operation under test; the type checker
        # rejects it statically, hence the ignore marker.
        with pytest.raises(ValidationError):
            locked.id = "new-id"  # type: ignore[misc]

    def test_serialisation(self) -> None:
        """model_dump() yields a mapping that carries the field values."""
        recorded_at = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)
        source = BenchmarkRun(
            id="test-id",
            benchmark_name="test",
            timestamp=recorded_at,
            veritext_version="0.1.0",
            metrics={"bleu4": 0.5},
            sample_count=10,
        )

        dumped = source.model_dump()

        assert dumped["id"] == "test-id"
        assert dumped["benchmark_name"] == "test"
        assert dumped["metrics"] == {"bleu4": 0.5}
|
|
|
|
|
|
class TestRegressionReport:
    """Checks for RegressionReport's summary text and immutability."""

    def test_no_regression_summary(self) -> None:
        """With detected=False the summary states that nothing regressed."""
        clean = RegressionReport(
            detected=False,
            baseline={"bleu4": 0.75, "rouge_l": 0.80},
            current={"bleu4": 0.76, "rouge_l": 0.81},
            deltas={"bleu4": 0.01, "rouge_l": 0.01},
            tolerance=0.05,
        )

        summary_text = clean.summary

        assert "No regression detected" in summary_text

    def test_regression_summary(self) -> None:
        """With detected=True the summary names the metric and both of its values."""
        regressed = RegressionReport(
            detected=True,
            baseline={"bleu4": 0.75, "rouge_l": 0.80},
            current={"bleu4": 0.65, "rouge_l": 0.78},
            deltas={"bleu4": -0.10, "rouge_l": -0.02},
            tolerance=0.05,
        )

        summary_text = regressed.summary

        assert "Regression detected" in summary_text
        assert "bleu4" in summary_text
        # Current and baseline values are expected with four decimal places.
        assert "0.6500" in summary_text
        assert "baseline: 0.7500" in summary_text

    def test_regression_excludes_within_tolerance(self) -> None:
        """Only metrics whose drop exceeds the tolerance are named in the summary."""
        regressed = RegressionReport(
            detected=True,
            baseline={"bleu4": 0.75, "rouge_l": 0.80},
            current={"bleu4": 0.65, "rouge_l": 0.78},
            deltas={"bleu4": -0.10, "rouge_l": -0.02},
            tolerance=0.05,
        )

        summary_text = regressed.summary

        # rouge_l moved by -0.02, inside the 0.05 tolerance: must be left out.
        assert "rouge_l" not in summary_text
        # bleu4 moved by -0.10, beyond the tolerance: must be listed.
        assert "bleu4" in summary_text

    def test_frozen_model(self) -> None:
        """Assigning to a field of an existing RegressionReport raises ValidationError."""
        locked = RegressionReport(
            detected=False,
            baseline={},
            current={},
            deltas={},
            tolerance=0.05,
        )

        # The assignment itself is the operation under test; the type checker
        # rejects it statically, hence the ignore marker.
        with pytest.raises(ValidationError):
            locked.detected = True  # type: ignore[misc]

    def test_tolerance_in_summary(self) -> None:
        """The summary shows the tolerance as a percentage with two decimals."""
        regressed = RegressionReport(
            detected=True,
            baseline={"metric": 0.80},
            current={"metric": 0.50},
            deltas={"metric": -0.30},
            tolerance=0.10,
        )

        summary_text = regressed.summary

        assert "10.00%" in summary_text
|