"""Tests for benchmark data models.""" from datetime import UTC, datetime import pytest from pydantic import ValidationError from veritext.benchmark.models import BenchmarkRun, RegressionReport class TestBenchmarkRun: """Tests for BenchmarkRun model.""" def test_create_benchmark_run(self) -> None: """BenchmarkRun can be created with required fields.""" run = BenchmarkRun( id="test-id-123", benchmark_name="test-benchmark", timestamp=datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC), veritext_version="0.1.0-dev", metrics={"bleu4": 0.75, "rouge_l": 0.82}, sample_count=100, ) assert run.id == "test-id-123" assert run.benchmark_name == "test-benchmark" assert run.veritext_version == "0.1.0-dev" assert run.metrics == {"bleu4": 0.75, "rouge_l": 0.82} assert run.sample_count == 100 assert run.metadata == {} def test_create_with_metadata(self) -> None: """BenchmarkRun can include optional metadata.""" run = BenchmarkRun( id="test-id-456", benchmark_name="test-benchmark", timestamp=datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC), veritext_version="0.1.0-dev", metrics={"bleu4": 0.75}, sample_count=50, metadata={"git_sha": "abc123", "model_version": "gpt-4"}, ) assert run.metadata == {"git_sha": "abc123", "model_version": "gpt-4"} def test_frozen_model(self) -> None: """BenchmarkRun is immutable.""" run = BenchmarkRun( id="test-id", benchmark_name="test", timestamp=datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC), veritext_version="0.1.0", metrics={"bleu4": 0.5}, sample_count=10, ) with pytest.raises(ValidationError): run.id = "new-id" # type: ignore[misc] def test_serialisation(self) -> None: """BenchmarkRun can be serialised to dict.""" run = BenchmarkRun( id="test-id", benchmark_name="test", timestamp=datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC), veritext_version="0.1.0", metrics={"bleu4": 0.5}, sample_count=10, ) data = run.model_dump() assert data["id"] == "test-id" assert data["benchmark_name"] == "test" assert data["metrics"] == {"bleu4": 0.5} class TestRegressionReport: """Tests for RegressionReport model.""" def test_no_regression_summary(self) -> None: """Summary indicates no regression when detected is False.""" report = RegressionReport( detected=False, baseline={"bleu4": 0.75, "rouge_l": 0.80}, current={"bleu4": 0.76, "rouge_l": 0.81}, deltas={"bleu4": 0.01, "rouge_l": 0.01}, tolerance=0.05, ) assert "No regression detected" in report.summary def test_regression_summary(self) -> None: """Summary lists regressed metrics when detected is True.""" report = RegressionReport( detected=True, baseline={"bleu4": 0.75, "rouge_l": 0.80}, current={"bleu4": 0.65, "rouge_l": 0.78}, deltas={"bleu4": -0.10, "rouge_l": -0.02}, tolerance=0.05, ) assert "Regression detected" in report.summary assert "bleu4" in report.summary assert "0.6500" in report.summary assert "baseline: 0.7500" in report.summary def test_regression_excludes_within_tolerance(self) -> None: """Summary only shows metrics that exceed tolerance.""" report = RegressionReport( detected=True, baseline={"bleu4": 0.75, "rouge_l": 0.80}, current={"bleu4": 0.65, "rouge_l": 0.78}, deltas={"bleu4": -0.10, "rouge_l": -0.02}, tolerance=0.05, ) # rouge_l is -0.02, within tolerance of 0.05, so shouldn't appear assert "rouge_l" not in report.summary # bleu4 is -0.10, exceeds tolerance, so should appear assert "bleu4" in report.summary def test_frozen_model(self) -> None: """RegressionReport is immutable.""" report = RegressionReport( detected=False, baseline={}, current={}, deltas={}, tolerance=0.05, ) with pytest.raises(ValidationError): report.detected = True # type: ignore[misc] def test_tolerance_in_summary(self) -> None: """Summary includes tolerance threshold.""" report = RegressionReport( detected=True, baseline={"metric": 0.80}, current={"metric": 0.50}, deltas={"metric": -0.30}, tolerance=0.10, ) assert "10.00%" in report.summary