feat(benchmark): add BenchmarkRun and RegressionReport models
Data models for benchmark runs and regression reports using Pydantic.
This commit is contained in:
72
src/veritext/benchmark/models.py
Normal file
72
src/veritext/benchmark/models.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""Benchmark data models."""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
|
||||
class BenchmarkRun(BaseModel):
    """Immutable record describing one execution of a benchmark.

    The model is configured with ``frozen=True``, so fields cannot be
    reassigned once an instance has been created.
    """

    model_config = ConfigDict(frozen=True)

    id: str
    """Identifier of this run (a UUID, held as a plain string)."""

    benchmark_name: str
    """Name of the benchmark suite this run belongs to."""

    timestamp: datetime
    """Moment at which the benchmark was executed."""

    veritext_version: str
    """veritext version the benchmark was run with."""

    metrics: dict[str, float]
    """Metric name -> score, e.g. {"rouge_l": 0.82, "bleu4": 0.71}."""

    sample_count: int
    """How many samples were evaluated in this run."""

    metadata: dict[str, Any] = Field(default_factory=dict)
    """Free-form extra information (git_sha, model version, etc.).

    Optional; defaults to a new empty dict for each instance.
    """
|
||||
|
||||
|
||||
class RegressionReport(BaseModel):
    """Immutable result of comparing a current run with a baseline.

    The model is configured with ``frozen=True``, so fields cannot be
    reassigned once an instance has been created.
    """

    model_config = ConfigDict(frozen=True)

    detected: bool
    """True when a regression was detected."""

    baseline: dict[str, float]
    """Metric name -> baseline value (rolling average)."""

    current: dict[str, float]
    """Metric name -> value measured in the current run."""

    deltas: dict[str, float]
    """Metric name -> difference from baseline (negative = regression)."""

    tolerance: float
    """Tolerance threshold that was used for detection."""

    @property
    def summary(self) -> str:
        """Return a human-readable description of the report.

        When ``detected`` is false a fixed "no regression" sentence is
        returned. Otherwise the result is a header line showing the
        tolerance as a percentage, followed by one line per metric whose
        delta is below ``-tolerance``. A metric missing from ``current``
        or ``baseline`` is shown with the value 0.0.
        """
        if not self.detected:
            return "No regression detected. All metrics within tolerance."

        threshold = -self.tolerance
        rows: list[str] = []
        for name, change in self.deltas.items():
            # Only metrics that dropped by more than the tolerance are listed.
            if change < threshold:
                now = self.current.get(name, 0.0)
                reference = self.baseline.get(name, 0.0)
                rows.append(
                    f" {name}: {now:.4f} "
                    f"(baseline: {reference:.4f}, "
                    f"delta: {change:+.4f})"
                )

        # The header always ends with a newline, even if no row qualified.
        header = f"Regression detected (tolerance: {self.tolerance:.2%}):\n"
        return header + "\n".join(rows)
|
||||
Reference in New Issue
Block a user