example: benchmark regression

Demonstrates benchmark quality tracking with historical comparison and
CI integration using assert_no_regression() for exit code control.
This commit is contained in:
2025-05-17 11:02:05 +00:00
parent 9cf968ad36
commit 0ea6adbbf4

View File

@@ -0,0 +1,160 @@
"""Benchmark quality tracking with regression detection.
Demonstrates Veritext's benchmark module for CI integration:
- Creating a benchmark suite
- Running evaluations and storing results
- Checking for quality regression
- CI integration pattern (the exit status is printed here, not actually set)
"""
import tempfile
from pathlib import Path
from veritext.benchmark import Benchmark
from veritext.core.exceptions import RegressionDetectedError
def create_sample_data() -> tuple[list[str], list[str]]:
    """Return aligned candidate and reference sentences for the examples.

    The data imitates summarisation outputs and their references. Both lists
    have the same length, and the item at index ``i`` of the first list is
    the candidate that belongs to the reference at index ``i`` of the second.

    Returns:
        A ``(candidates, references)`` tuple of two equally long lists.
    """
    # Each row is one (simulated summariser output, reference) pair.
    pairs = (
        (
            "The new policy aims to reduce carbon emissions by 50% by 2030.",
            "The policy targets a 50% reduction in carbon emissions by 2030.",
        ),
        (
            "Scientists discovered a new species of deep-sea fish.",
            "A new deep-sea fish species was discovered by marine biologists.",
        ),
        (
            "The company reported record profits in the third quarter.",
            "Record profits were announced by the company for Q3.",
        ),
        (
            "Researchers developed a breakthrough treatment for the disease.",
            "A breakthrough disease treatment was developed by researchers.",
        ),
        (
            "The city plans to expand public transportation routes.",
            "Public transport expansion is planned for the city.",
        ),
    )
    outputs = [candidate for candidate, _ in pairs]
    targets = [reference for _, reference in pairs]
    return outputs, targets
def run_benchmark_example() -> None:
    """Evaluate the sample data once and print a summary of the run.

    A ``Benchmark`` named ``"summariser_quality"`` is created with its
    storage path inside a temporary directory. One evaluation is run with the
    ``rouge_l`` and ``bleu4`` metrics, then the first eight characters of the
    run ID, the sample count and each metric value are printed.
    """
    # The storage path lives in a throwaway directory for this example.
    with tempfile.TemporaryDirectory() as scratch_dir:
        suite = Benchmark(
            "summariser_quality",
            storage_path=Path(scratch_dir) / "benchmarks",
        )
        outputs, targets = create_sample_data()

        print("Running benchmark evaluation...")
        result = suite.evaluate(
            candidates=outputs,
            references=targets,
            metrics=["rouge_l", "bleu4"],
            metadata={"model": "v1.0", "dataset": "test"},
        )

        print("\nBenchmark run completed:")
        # Only a short prefix of the run ID is shown.
        print(f" Run ID: {result.id[:8]}...")
        print(f" Samples: {result.sample_count}")
        print(" Metrics:")
        for metric_name, score in result.metrics.items():
            print(f" {metric_name}: {score:.4f}")
def regression_detection_example() -> None:
    """Show a regression check before and after a deliberately worse run.

    Five identical baseline evaluations are recorded and a first regression
    check is printed. One more evaluation with much shorter candidates is
    then recorded and the check is repeated. If that second report flags a
    regression, the baseline value, current value and delta of every metric
    in the report are printed.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        suite = Benchmark(
            "summariser_quality",
            storage_path=Path(scratch_dir) / "benchmarks",
        )
        outputs, targets = create_sample_data()

        # Record a history of identical runs to serve as the baseline.
        print("\nBuilding baseline with historical runs...")
        for index in range(5):
            suite.evaluate(
                candidates=outputs,
                references=targets,
                metrics=["rouge_l", "bleu4"],
                metadata={"run": f"baseline_{index}"},
            )
            print(f" Baseline run {index + 1} recorded")

        # First check: every run so far used the same inputs.
        outcome = suite.check_regression(tolerance=0.05, window=5)
        status = "DETECTED" if outcome.detected else "NONE"
        print(f"\nRegression check: {status}")

        # Add one run whose candidates are much shorter than the references.
        print("\nSimulating degraded model output...")
        worse_outputs = [
            "Policy carbon emissions.",
            "Fish discovered.",
            "Company profits.",
            "Treatment developed.",
            "Transport expansion.",
        ]
        suite.evaluate(
            candidates=worse_outputs,
            references=targets,
            metrics=["rouge_l", "bleu4"],
            metadata={"model": "v1.1-broken"},
        )

        # Second check: expected to flag the degraded run.
        outcome = suite.check_regression(tolerance=0.05, window=5)
        status = "DETECTED" if outcome.detected else "NONE"
        print(f"Regression check: {status}")

        if not outcome.detected:
            return

        print("\nRegression details:")
        for metric_name, change in outcome.deltas.items():
            # Fall back to 0 when a metric is missing from either side.
            before = outcome.baseline.get(metric_name, 0)
            after = outcome.current.get(metric_name, 0)
            print(f" {metric_name}: {before:.4f} -> {after:.4f} ({change:+.4f})")
def ci_integration_example() -> None:
    """Show how ``assert_no_regression()`` could gate a CI job.

    Three baseline ``rouge_l`` evaluations are recorded, then one more run is
    evaluated and checked. The outcome is only printed as a simulated exit
    status: a raised ``RegressionDetectedError`` is caught here and the
    process is not terminated.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        suite = Benchmark(
            "ci_check",
            storage_path=Path(scratch_dir) / "benchmarks",
        )
        outputs, targets = create_sample_data()

        # Seed the history that the checked run is compared against.
        for _ in range(3):
            suite.evaluate(outputs, targets, metrics=["rouge_l"])

        banner = "=" * 50
        print("\n" + banner)
        print("CI Integration Example")
        print(banner)

        print("\nRunning evaluation...")
        suite.evaluate(outputs, targets, metrics=["rouge_l"])

        print("Checking for regression...")
        try:
            suite.assert_no_regression(tolerance=0.05, window=3)
        except RegressionDetectedError as error:
            print(f"Regression detected: {error}")
            print("CI status: EXIT 1")
        else:
            # Reached only when the assertion did not raise.
            print("No regression detected.")
            print("CI status: EXIT 0")
def main() -> None:
    """Run every example in order, framed by a header and a footer banner."""
    rule = "=" * 60
    print(rule)
    print("Veritext Benchmark & Regression Detection Examples")
    print(rule)

    # The examples are independent; each one creates its own temp storage.
    examples = (
        run_benchmark_example,
        regression_detection_example,
        ci_integration_example,
    )
    for example in examples:
        example()

    print("\n" + rule)
    print("All examples completed.")
# Run the examples only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()