Demonstrates benchmark quality tracking with historical comparison and CI integration using assert_no_regression() for exit code control.
161 lines
5.6 KiB
Python
161 lines
5.6 KiB
Python
"""Benchmark quality tracking with regression detection.
|
|
|
|
Demonstrates Veritext's benchmark module for CI integration:
|
|
- Creating a benchmark suite
|
|
- Running evaluations and storing results
|
|
- Checking for quality regression
|
|
- CI integration pattern with exit codes
|
|
"""
|
|
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
from veritext.benchmark import Benchmark
|
|
from veritext.core.exceptions import RegressionDetectedError
|
|
|
|
|
|
def create_sample_data() -> tuple[list[str], list[str]]:
    """Create sample candidate/reference pairs for benchmarking.

    Returns:
        A ``(candidates, references)`` tuple of two lists with five
        sentences each. ``candidates[i]`` is the simulated summariser
        output that is paired with ``references[i]``. Fresh lists are
        built on every call, so callers may mutate them freely.
    """
    # Simulated summarisation outputs and references
    candidates = [
        "The new policy aims to reduce carbon emissions by 50% by 2030.",
        "Scientists discovered a new species of deep-sea fish.",
        "The company reported record profits in the third quarter.",
        "Researchers developed a breakthrough treatment for the disease.",
        "The city plans to expand public transportation routes.",
    ]
    references = [
        "The policy targets a 50% reduction in carbon emissions by 2030.",
        "A new deep-sea fish species was discovered by marine biologists.",
        "Record profits were announced by the company for Q3.",
        "A breakthrough disease treatment was developed by researchers.",
        "Public transport expansion is planned for the city.",
    ]
    return candidates, references
|
|
|
|
|
|
def run_benchmark_example() -> None:
    """Run a benchmark evaluation and view results.

    Creates a ``Benchmark`` named ``"summariser_quality"`` whose storage
    path lives inside a temporary directory, evaluates the sample data
    with the ``rouge_l`` and ``bleu4`` metrics, and prints the run's ID
    prefix, sample count and per-metric scores.
    """
    # Use a temp directory for this example; it is deleted when the
    # ``with`` block exits.
    with tempfile.TemporaryDirectory() as tmpdir:
        storage_path = Path(tmpdir) / "benchmarks"

        # Create benchmark suite
        bench = Benchmark("summariser_quality", storage_path=storage_path)

        candidates, references = create_sample_data()

        # Run evaluation
        print("Running benchmark evaluation...")
        run = bench.evaluate(
            candidates=candidates,
            references=references,
            metrics=["rouge_l", "bleu4"],
            metadata={"model": "v1.0", "dataset": "test"},
        )

        print("\nBenchmark run completed:")
        # Only the first 8 characters of the run ID are shown.
        print(f" Run ID: {run.id[:8]}...")
        print(f" Samples: {run.sample_count}")
        print(" Metrics:")
        for name, value in run.metrics.items():
            print(f" {name}: {value:.4f}")
|
|
|
|
|
|
def regression_detection_example() -> None:
    """Demonstrate regression detection with historical comparison.

    Records five runs on the unmodified sample data, checks for a
    regression, then records one run with deliberately truncated
    candidates and checks again. When the second report is flagged as
    detected, the baseline, current and delta value of each metric is
    printed.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        storage_path = Path(tmpdir) / "benchmarks"
        bench = Benchmark("summariser_quality", storage_path=storage_path)

        candidates, references = create_sample_data()

        # Simulate historical runs with stable quality: the same inputs
        # are evaluated five times.
        print("\nBuilding baseline with historical runs...")
        for i in range(5):
            bench.evaluate(
                candidates=candidates,
                references=references,
                metrics=["rouge_l", "bleu4"],
                metadata={"run": f"baseline_{i}"},
            )
            print(f" Baseline run {i + 1} recorded")

        # Check regression (no degradation expected)
        # NOTE(review): the exact meaning of ``tolerance`` and ``window``
        # is defined by Benchmark.check_regression -- confirm there.
        report = bench.check_regression(tolerance=0.05, window=5)
        print(f"\nRegression check: {'DETECTED' if report.detected else 'NONE'}")

        # Simulate a degraded model
        print("\nSimulating degraded model output...")
        degraded_candidates = [
            "Policy carbon emissions.",  # Much shorter/worse
            "Fish discovered.",
            "Company profits.",
            "Treatment developed.",
            "Transport expansion.",
        ]
        bench.evaluate(
            candidates=degraded_candidates,
            references=references,
            metrics=["rouge_l", "bleu4"],
            metadata={"model": "v1.1-broken"},
        )

        # Check regression (should detect)
        report = bench.check_regression(tolerance=0.05, window=5)
        print(f"Regression check: {'DETECTED' if report.detected else 'NONE'}")
        if report.detected:
            print("\nRegression details:")
            for metric, delta in report.deltas.items():
                # Fall back to 0 when a metric is missing from either side
                # of the report.
                baseline = report.baseline.get(metric, 0)
                current = report.current.get(metric, 0)
                print(f" {metric}: {baseline:.4f} -> {current:.4f} ({delta:+.4f})")
|
|
|
|
|
|
def ci_integration_example() -> None:
    """CI integration pattern using assert_no_regression().

    Records three baseline runs and one further run for a ``"ci_check"``
    benchmark, then calls ``assert_no_regression()`` and reports the
    outcome. This example only *prints* the exit status it stands for
    ("EXIT 0" / "EXIT 1") so the remaining examples keep running; it
    never terminates the process itself.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        storage_path = Path(tmpdir) / "benchmarks"
        bench = Benchmark("ci_check", storage_path=storage_path)

        candidates, references = create_sample_data()

        # Build baseline
        for _ in range(3):
            bench.evaluate(
                candidates=candidates,
                references=references,
                metrics=["rouge_l"],
            )

        # Simulate CI check
        print("\n" + "=" * 50)
        print("CI Integration Example")
        print("=" * 50)

        print("\nRunning evaluation...")
        bench.evaluate(
            candidates=candidates,
            references=references,
            metrics=["rouge_l"],
        )

        print("Checking for regression...")
        # Keep the try body down to the one call whose exception is
        # handled; the success path lives in ``else``.
        try:
            bench.assert_no_regression(tolerance=0.05, window=3)
        except RegressionDetectedError as e:
            print(f"Regression detected: {e}")
            print("CI status: EXIT 1")
        else:
            print("No regression detected.")
            print("CI status: EXIT 0")
|
|
|
|
|
|
def main() -> None:
    """Run all benchmark examples.

    Prints a banner, runs the basic benchmark, regression-detection and
    CI-integration examples in that order, then prints a closing line.
    """
    print("=" * 60)
    print("Veritext Benchmark & Regression Detection Examples")
    print("=" * 60)

    run_benchmark_example()
    regression_detection_example()
    ci_integration_example()

    print("\n" + "=" * 60)
    print("All examples completed.")
|
|
|
|
|
|
# Run the examples only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|