feat(benchmark): add BenchmarkRun and RegressionReport models

Data models for benchmark runs and regression reports using Pydantic.
This commit is contained in:
2026-02-03 18:09:43 +00:00
parent 6bafc43754
commit 45dfe07772

View File

@@ -0,0 +1,72 @@
"""Benchmark data models."""
from datetime import datetime
from typing import Any
from pydantic import BaseModel, ConfigDict, Field
class BenchmarkRun(BaseModel):
    """Immutable record describing one execution of a benchmark.

    The model is configured with ``frozen=True``, so fields cannot be
    reassigned after the instance has been created.
    """

    model_config = ConfigDict(frozen=True)

    id: str
    """Identifier of this run (documented as a UUID; stored as a string)."""

    benchmark_name: str
    """Name of the benchmark suite this run belongs to."""

    timestamp: datetime
    """Point in time at which the benchmark was executed."""

    veritext_version: str
    """Version string of veritext that produced the results."""

    metrics: dict[str, float]
    """Metric name mapped to its score, e.g. {"rouge_l": 0.82, "bleu4": 0.71}."""

    sample_count: int
    """How many samples were evaluated in this run."""

    metadata: dict[str, Any] = Field(default_factory=dict)
    """Free-form extra information (git_sha, model version, etc.).

    Defaults to a new empty dict for each instance.
    """
class RegressionReport(BaseModel):
    """Immutable result of comparing a run's metrics with baseline values.

    The model is configured with ``frozen=True``, so fields cannot be
    reassigned after the instance has been created.
    """

    model_config = ConfigDict(frozen=True)

    detected: bool
    """True when a regression was detected."""

    baseline: dict[str, float]
    """Baseline value per metric (documented as a rolling average)."""

    current: dict[str, float]
    """Value per metric for the run being checked."""

    deltas: dict[str, float]
    """Per-metric difference from the baseline; negative means regression."""

    tolerance: float
    """Threshold that was applied when detecting regressions."""

    @property
    def summary(self) -> str:
        """Build a human-readable description of this report.

        Returns:
            A fixed "no regression" sentence when ``detected`` is false.
            Otherwise a header showing the tolerance as a percentage,
            followed by one line for every metric in ``deltas`` whose delta
            is below ``-tolerance``. A metric absent from ``current`` or
            ``baseline`` is printed as 0.0.
        """
        if self.detected:
            floor = -self.tolerance
            rows = []
            for name, change in self.deltas.items():
                # Keep the comparison in this positive form: a NaN delta
                # compares false and is therefore left out of the listing.
                if change < floor:
                    now = self.current.get(name, 0.0)
                    reference = self.baseline.get(name, 0.0)
                    rows.append(
                        f" {name}: {now:.4f} "
                        f"(baseline: {reference:.4f}, "
                        f"delta: {change:+.4f})"
                    )
            header = f"Regression detected (tolerance: {self.tolerance:.2%}):\n"
            return header + "\n".join(rows)
        return "No regression detected. All metrics within tolerance."