example: benchmark regression
Demonstrates benchmark quality tracking with historical comparison and CI integration using assert_no_regression() for exit code control.
This commit is contained in:
160
examples/benchmark_regression.py
Normal file
160
examples/benchmark_regression.py
Normal file
@@ -0,0 +1,160 @@
|
||||
"""Benchmark quality tracking with regression detection.

Demonstrates Veritext's benchmark module for CI integration:
- Creating a benchmark suite
- Running evaluations and storing results
- Checking for quality regression
- CI integration pattern with exit codes
"""
|
||||
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from veritext.benchmark import Benchmark
|
||||
from veritext.core.exceptions import RegressionDetectedError
|
||||
|
||||
|
||||
def create_sample_data() -> tuple[list[str], list[str]]:
    """Create sample candidate/reference pairs for benchmarking.

    Returns:
        A ``(candidates, references)`` tuple of two equal-length lists of
        strings. ``candidates[i]`` is the simulated summariser output that
        is scored against ``references[i]``. Fresh lists are built on every
        call, so callers may mutate the result freely.
    """
    # Simulated summarisation outputs and references
    candidates = [
        "The new policy aims to reduce carbon emissions by 50% by 2030.",
        "Scientists discovered a new species of deep-sea fish.",
        "The company reported record profits in the third quarter.",
        "Researchers developed a breakthrough treatment for the disease.",
        "The city plans to expand public transportation routes.",
    ]
    references = [
        "The policy targets a 50% reduction in carbon emissions by 2030.",
        "A new deep-sea fish species was discovered by marine biologists.",
        "Record profits were announced by the company for Q3.",
        "A breakthrough disease treatment was developed by researchers.",
        "Public transport expansion is planned for the city.",
    ]
    return candidates, references
|
||||
|
||||
|
||||
def run_benchmark_example() -> None:
    """Run a benchmark evaluation and view results.

    Creates a ``Benchmark`` named ``"summariser_quality"`` whose storage
    path lives inside a temporary directory, evaluates the sample data with
    the ``rouge_l`` and ``bleu4`` metrics, and prints the run's ID prefix,
    sample count and per-metric values.
    """
    # Use a temp directory for this example so nothing is left on disk.
    with tempfile.TemporaryDirectory() as tmpdir:
        storage_path = Path(tmpdir) / "benchmarks"

        # Create benchmark suite
        bench = Benchmark("summariser_quality", storage_path=storage_path)

        candidates, references = create_sample_data()

        # Run evaluation
        print("Running benchmark evaluation...")
        run = bench.evaluate(
            candidates=candidates,
            references=references,
            metrics=["rouge_l", "bleu4"],
            metadata={"model": "v1.0", "dataset": "test"},
        )

        print("\nBenchmark run completed:")
        # Only the first 8 characters of the run ID are shown to keep the
        # output compact.
        print(f" Run ID: {run.id[:8]}...")
        print(f" Samples: {run.sample_count}")
        print(" Metrics:")
        for name, value in run.metrics.items():
            print(f" {name}: {value:.4f}")
|
||||
|
||||
|
||||
def regression_detection_example() -> None:
    """Demonstrate regression detection with historical comparison.

    Records five baseline runs on the unmodified sample data, checks for a
    regression, then records one run with deliberately truncated candidates
    and checks again. When the second report flags a regression, the
    per-metric baseline, current value and delta are printed.
    """
    # Shared settings, hoisted so every evaluate()/check_regression() call
    # in this example stays in sync.
    metrics = ["rouge_l", "bleu4"]
    tolerance = 0.05
    window = 5

    with tempfile.TemporaryDirectory() as tmpdir:
        storage_path = Path(tmpdir) / "benchmarks"
        bench = Benchmark("summariser_quality", storage_path=storage_path)

        candidates, references = create_sample_data()

        # Simulate historical runs with stable quality
        print("\nBuilding baseline with historical runs...")
        for i in range(5):
            bench.evaluate(
                candidates=candidates,
                references=references,
                metrics=metrics,
                metadata={"run": f"baseline_{i}"},
            )
            print(f" Baseline run {i + 1} recorded")

        # Check regression (no degradation expected)
        report = bench.check_regression(tolerance=tolerance, window=window)
        print(f"\nRegression check: {'DETECTED' if report.detected else 'NONE'}")

        # Simulate a degraded model
        print("\nSimulating degraded model output...")
        degraded_candidates = [
            "Policy carbon emissions.",  # Much shorter/worse
            "Fish discovered.",
            "Company profits.",
            "Treatment developed.",
            "Transport expansion.",
        ]
        bench.evaluate(
            candidates=degraded_candidates,
            references=references,
            metrics=metrics,
            metadata={"model": "v1.1-broken"},
        )

        # Check regression (should detect)
        report = bench.check_regression(tolerance=tolerance, window=window)
        print(f"Regression check: {'DETECTED' if report.detected else 'NONE'}")

        if report.detected:
            print("\nRegression details:")
            for metric, delta in report.deltas.items():
                # Fall back to 0 if a metric is missing from either side of
                # the report so the formatted line can still be printed.
                baseline = report.baseline.get(metric, 0)
                current = report.current.get(metric, 0)
                print(f" {metric}: {baseline:.4f} -> {current:.4f} ({delta:+.4f})")
|
||||
|
||||
|
||||
def ci_integration_example() -> None:
    """CI integration pattern using assert_no_regression().

    Records three baseline runs plus one "current" run on identical data,
    then calls ``assert_no_regression``. The outcome is printed as the exit
    status a CI job would report; this demo only prints it and does not
    terminate the process.
    """
    metrics = ["rouge_l"]

    with tempfile.TemporaryDirectory() as tmpdir:
        storage_path = Path(tmpdir) / "benchmarks"
        bench = Benchmark("ci_check", storage_path=storage_path)

        candidates, references = create_sample_data()

        # Build baseline
        for _ in range(3):
            bench.evaluate(
                candidates=candidates,
                references=references,
                metrics=metrics,
            )

        # Simulate CI check
        print("\n" + "=" * 50)
        print("CI Integration Example")
        print("=" * 50)

        print("\nRunning evaluation...")
        bench.evaluate(
            candidates=candidates,
            references=references,
            metrics=metrics,
        )

        print("Checking for regression...")
        try:
            # Keep the try body to the single call whose failure is handled.
            bench.assert_no_regression(tolerance=0.05, window=3)
        except RegressionDetectedError as e:
            print(f"Regression detected: {e}")
            print("CI status: EXIT 1")
        else:
            print("No regression detected.")
            print("CI status: EXIT 0")
|
||||
|
||||
|
||||
def main() -> None:
    """Run all benchmark examples.

    Prints a banner, runs the three example functions in order (single
    benchmark run, regression detection, CI integration), then prints a
    closing line.
    """
    print("=" * 60)
    print("Veritext Benchmark & Regression Detection Examples")
    print("=" * 60)

    run_benchmark_example()
    regression_detection_example()
    ci_integration_example()

    print("\n" + "=" * 60)
    print("All examples completed.")
|
||||
|
||||
|
||||
# Run the examples only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user