"""Benchmark quality tracking with regression detection. Demonstrates Veritext's benchmark module for CI integration: - Creating a benchmark suite - Running evaluations and storing results - Checking for quality regression - CI integration pattern with exit codes """ import tempfile from pathlib import Path from veritext.benchmark import Benchmark from veritext.core.exceptions import RegressionDetectedError def create_sample_data() -> tuple[list[str], list[str]]: """Create sample candidate/reference pairs for benchmarking.""" # Simulated summarisation outputs and references candidates = [ "The new policy aims to reduce carbon emissions by 50% by 2030.", "Scientists discovered a new species of deep-sea fish.", "The company reported record profits in the third quarter.", "Researchers developed a breakthrough treatment for the disease.", "The city plans to expand public transportation routes.", ] references = [ "The policy targets a 50% reduction in carbon emissions by 2030.", "A new deep-sea fish species was discovered by marine biologists.", "Record profits were announced by the company for Q3.", "A breakthrough disease treatment was developed by researchers.", "Public transport expansion is planned for the city.", ] return candidates, references def run_benchmark_example() -> None: """Run a benchmark evaluation and view results.""" # Use a temp directory for this example with tempfile.TemporaryDirectory() as tmpdir: storage_path = Path(tmpdir) / "benchmarks" # Create benchmark suite bench = Benchmark("summariser_quality", storage_path=storage_path) candidates, references = create_sample_data() # Run evaluation print("Running benchmark evaluation...") run = bench.evaluate( candidates=candidates, references=references, metrics=["rouge_l", "bleu4"], metadata={"model": "v1.0", "dataset": "test"}, ) print("\nBenchmark run completed:") print(f" Run ID: {run.id[:8]}...") print(f" Samples: {run.sample_count}") print(" Metrics:") for name, value in run.metrics.items(): print(f" {name}: {value:.4f}") def regression_detection_example() -> None: """Demonstrate regression detection with historical comparison.""" with tempfile.TemporaryDirectory() as tmpdir: storage_path = Path(tmpdir) / "benchmarks" bench = Benchmark("summariser_quality", storage_path=storage_path) candidates, references = create_sample_data() # Simulate historical runs with stable quality print("\nBuilding baseline with historical runs...") for i in range(5): bench.evaluate( candidates=candidates, references=references, metrics=["rouge_l", "bleu4"], metadata={"run": f"baseline_{i}"}, ) print(f" Baseline run {i + 1} recorded") # Check regression (no degradation expected) report = bench.check_regression(tolerance=0.05, window=5) print(f"\nRegression check: {'DETECTED' if report.detected else 'NONE'}") # Simulate a degraded model print("\nSimulating degraded model output...") degraded_candidates = [ "Policy carbon emissions.", # Much shorter/worse "Fish discovered.", "Company profits.", "Treatment developed.", "Transport expansion.", ] bench.evaluate( candidates=degraded_candidates, references=references, metrics=["rouge_l", "bleu4"], metadata={"model": "v1.1-broken"}, ) # Check regression (should detect) report = bench.check_regression(tolerance=0.05, window=5) print(f"Regression check: {'DETECTED' if report.detected else 'NONE'}") if report.detected: print("\nRegression details:") for metric, delta in report.deltas.items(): baseline = report.baseline.get(metric, 0) current = report.current.get(metric, 0) print(f" {metric}: {baseline:.4f} -> {current:.4f} ({delta:+.4f})") def ci_integration_example() -> None: """CI integration pattern using assert_no_regression().""" with tempfile.TemporaryDirectory() as tmpdir: storage_path = Path(tmpdir) / "benchmarks" bench = Benchmark("ci_check", storage_path=storage_path) candidates, references = create_sample_data() # Build baseline for _ in range(3): bench.evaluate(candidates, references, metrics=["rouge_l"]) # Simulate CI check print("\n" + "=" * 50) print("CI Integration Example") print("=" * 50) print("\nRunning evaluation...") bench.evaluate(candidates, references, metrics=["rouge_l"]) print("Checking for regression...") try: bench.assert_no_regression(tolerance=0.05, window=3) print("No regression detected.") print("CI status: EXIT 0") except RegressionDetectedError as e: print(f"Regression detected: {e}") print("CI status: EXIT 1") def main() -> None: """Run all benchmark examples.""" print("=" * 60) print("Veritext Benchmark & Regression Detection Examples") print("=" * 60) run_benchmark_example() regression_detection_example() ci_integration_example() print("\n" + "=" * 60) print("All examples completed.") if __name__ == "__main__": main()