# src/veritext/benchmark/models.py
"""Data models describing benchmark runs and regression reports."""

from datetime import datetime
from typing import Any

from pydantic import BaseModel, ConfigDict, Field


class BenchmarkRun(BaseModel):
    """A single execution of a benchmark suite and the metrics it produced.

    The model is configured as frozen, so fields are not meant to be
    reassigned once the instance has been created.
    """

    model_config = ConfigDict(frozen=True)

    id: str
    """UUID identifying this run."""

    benchmark_name: str
    """Name of the benchmark suite this run belongs to."""

    timestamp: datetime
    """Moment at which the benchmark was executed."""

    veritext_version: str
    """veritext version the run was performed with."""

    metrics: dict[str, float]
    """Metric name -> value, e.g. {"rouge_l": 0.82, "bleu4": 0.71}."""

    sample_count: int
    """How many samples were evaluated."""

    metadata: dict[str, Any] = Field(default_factory=dict)
    """Free-form extra information (git_sha, model version, etc.); empty by default."""


class RegressionReport(BaseModel):
    """Outcome of comparing the current run's metrics with a baseline.

    The model is configured as frozen, so fields are not meant to be
    reassigned once the instance has been created.
    """

    model_config = ConfigDict(frozen=True)

    detected: bool
    """True when a regression was detected."""

    baseline: dict[str, float]
    """Metric values of the baseline (rolling average)."""

    current: dict[str, float]
    """Metric values of the current run."""

    deltas: dict[str, float]
    """Per-metric difference from the baseline (negative = regression)."""

    tolerance: float
    """Threshold that was used when deciding whether a regression occurred."""

    @property
    def summary(self) -> str:
        """Render the report as human-readable text.

        Returns a fixed one-line message when ``detected`` is false.
        Otherwise returns a header showing the tolerance as a percentage,
        followed by one line for every metric in ``deltas`` whose delta is
        below ``-tolerance``. A metric absent from ``current`` or
        ``baseline`` is printed with the value 0.0.
        """
        if self.detected:
            threshold = -self.tolerance
            offending: list[str] = []
            for name, change in self.deltas.items():
                # Positive "<" test on purpose: a NaN delta compares false
                # and is therefore left out of the listing.
                if change < threshold:
                    now = self.current.get(name, 0.0)
                    reference = self.baseline.get(name, 0.0)
                    offending.append(
                        f" {name}: {now:.4f} "
                        f"(baseline: {reference:.4f}, "
                        f"delta: {change:+.4f})"
                    )
            header = f"Regression detected (tolerance: {self.tolerance:.2%}):\n"
            text = header + "\n".join(offending)
        else:
            text = "No regression detected. All metrics within tolerance."
        return text