feat(benchmark): add BenchmarkRun and RegressionReport models
Data models for benchmark runs and regression reports using Pydantic.
This commit is contained in:
72
src/veritext/benchmark/models.py
Normal file
72
src/veritext/benchmark/models.py
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
"""Benchmark data models."""
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from pydantic import BaseModel, ConfigDict, Field
|
||||||
|
|
||||||
|
|
||||||
|
class BenchmarkRun(BaseModel):
    """Record of a single benchmark execution."""

    # frozen=True makes pydantic reject attribute assignment on instances.
    # NOTE(review): the dict-valued fields below are still ordinary dicts,
    # so their contents are presumably not protected by this -- confirm
    # before relying on deep immutability.
    model_config = ConfigDict(frozen=True)

    # --- identity -------------------------------------------------------
    # Annotated as plain ``str``; no UUID format check is declared here.
    id: str
    """UUID for this run."""

    benchmark_name: str
    """Name identifying this benchmark suite."""

    # --- provenance -----------------------------------------------------
    timestamp: datetime
    """When the benchmark was executed."""

    veritext_version: str
    """Version of veritext used."""

    # --- results --------------------------------------------------------
    # Maps metric name -> score.
    metrics: dict[str, float]
    """Metric results, e.g. {"rouge_l": 0.82, "bleu4": 0.71}."""

    sample_count: int
    """Number of samples evaluated."""

    # --- free-form extras -----------------------------------------------
    # default_factory gives every instance its own fresh empty dict when
    # the caller omits this field.
    metadata: dict[str, Any] = Field(default_factory=dict)
    """Optional metadata (git_sha, model version, etc.)."""
|
||||||
|
|
||||||
|
|
||||||
|
class RegressionReport(BaseModel):
    """Report comparing current run against baseline."""

    # frozen=True makes pydantic reject attribute assignment on instances.
    model_config = ConfigDict(frozen=True)

    # Verdict supplied by whoever builds the report; this class does not
    # recompute it from the deltas.
    detected: bool
    """Whether a regression was detected."""

    # The three mappings below are keyed by metric name.
    baseline: dict[str, float]
    """Baseline metric values (rolling average)."""

    current: dict[str, float]
    """Current run metric values."""

    deltas: dict[str, float]
    """Difference from baseline (negative = regression)."""

    tolerance: float
    """Tolerance threshold used for detection."""

    @property
    def summary(self) -> str:
        """Render the report as text for humans.

        Returns:
            A fixed one-line message when ``detected`` is false. Otherwise a
            header line showing the tolerance as a percentage, followed by
            one line per metric whose delta is strictly below ``-tolerance``.
            Each line shows the current value, the baseline value and the
            signed delta; a metric missing from ``current`` or ``baseline``
            is displayed as 0.0.
        """
        # Guard clause: nothing to list when no regression was flagged.
        if not self.detected:
            return "No regression detected. All metrics within tolerance."

        cutoff = -self.tolerance
        header = f"Regression detected (tolerance: {self.tolerance:.2%}):\n"

        entries = []
        for name, change in self.deltas.items():
            # Strict comparison: a delta exactly at the cutoff is not listed.
            if change < cutoff:
                now = self.current.get(name, 0.0)
                reference = self.baseline.get(name, 0.0)
                entries.append(
                    f" {name}: {now:.4f} "
                    f"(baseline: {reference:.4f}, "
                    f"delta: {change:+.4f})"
                )

        # The header keeps its trailing newline even when ``entries`` is empty.
        return header + "\n".join(entries)
|
||||||
Reference in New Issue
Block a user