"""Example: tracking benchmark quality and detecting regressions.

Walks through Veritext's benchmark module as it might be used from CI:
- building a benchmark suite backed by a storage directory
- evaluating candidate/reference pairs and reading back the recorded run
- comparing the latest run against recent history to spot a regression
- a CI-style check that prints the exit status a pipeline would report
"""

import tempfile
from pathlib import Path

from veritext.benchmark import Benchmark
from veritext.core.exceptions import RegressionDetectedError


def _status_label(report) -> str:
    """Return the label printed for a regression report's ``detected`` flag."""
    if report.detected:
        return "DETECTED"
    return "NONE"


def create_sample_data() -> tuple[list[str], list[str]]:
    """Return parallel lists of candidate texts and their references.

    Returns:
        A ``(candidates, references)`` tuple; item ``i`` of each list
        belongs to the same sample.
    """
    # Each entry is (simulated summariser output, reference summary).
    pairs = [
        (
            "The new policy aims to reduce carbon emissions by 50% by 2030.",
            "The policy targets a 50% reduction in carbon emissions by 2030.",
        ),
        (
            "Scientists discovered a new species of deep-sea fish.",
            "A new deep-sea fish species was discovered by marine biologists.",
        ),
        (
            "The company reported record profits in the third quarter.",
            "Record profits were announced by the company for Q3.",
        ),
        (
            "Researchers developed a breakthrough treatment for the disease.",
            "A breakthrough disease treatment was developed by researchers.",
        ),
        (
            "The city plans to expand public transportation routes.",
            "Public transport expansion is planned for the city.",
        ),
    ]
    candidates = [candidate for candidate, _ in pairs]
    references = [reference for _, reference in pairs]
    return candidates, references


def run_benchmark_example() -> None:
    """Evaluate the sample data once and print the recorded run."""
    # Results are stored under a throwaway directory so the example
    # leaves nothing behind.
    with tempfile.TemporaryDirectory() as workdir:
        suite = Benchmark(
            "summariser_quality",
            storage_path=Path(workdir, "benchmarks"),
        )
        outputs, targets = create_sample_data()

        print("Running benchmark evaluation...")
        result = suite.evaluate(
            candidates=outputs,
            references=targets,
            metrics=["rouge_l", "bleu4"],
            metadata={"model": "v1.0", "dataset": "test"},
        )

        print("\nBenchmark run completed:")
        print(f" Run ID: {result.id[:8]}...")
        print(f" Samples: {result.sample_count}")
        print(" Metrics:")
        for metric_name, score in result.metrics.items():
            print(f" {metric_name}: {score:.4f}")


def regression_detection_example() -> None:
    """Record stable runs, then a degraded one, checking regression each time."""
    with tempfile.TemporaryDirectory() as workdir:
        suite = Benchmark(
            "summariser_quality",
            storage_path=Path(workdir, "benchmarks"),
        )
        outputs, targets = create_sample_data()

        # Five identical evaluations stand in for a stable history.
        print("\nBuilding baseline with historical runs...")
        for run_number in range(1, 6):
            suite.evaluate(
                candidates=outputs,
                references=targets,
                metrics=["rouge_l", "bleu4"],
                metadata={"run": f"baseline_{run_number - 1}"},
            )
            print(f" Baseline run {run_number} recorded")

        # First check: only the stable runs exist, so none is expected.
        stable_report = suite.check_regression(tolerance=0.05, window=5)
        print(f"\nRegression check: {_status_label(stable_report)}")

        # Record one run with heavily truncated outputs.
        print("\nSimulating degraded model output...")
        truncated_outputs = [
            "Policy carbon emissions.",  # Much shorter/worse
            "Fish discovered.",
            "Company profits.",
            "Treatment developed.",
            "Transport expansion.",
        ]
        suite.evaluate(
            candidates=truncated_outputs,
            references=targets,
            metrics=["rouge_l", "bleu4"],
            metadata={"model": "v1.1-broken"},
        )

        # Second check: the degraded run should now be flagged.
        degraded_report = suite.check_regression(tolerance=0.05, window=5)
        print(f"Regression check: {_status_label(degraded_report)}")
        if not degraded_report.detected:
            return

        print("\nRegression details:")
        for metric, delta in degraded_report.deltas.items():
            before = degraded_report.baseline.get(metric, 0)
            after = degraded_report.current.get(metric, 0)
            print(f" {metric}: {before:.4f} -> {after:.4f} ({delta:+.4f})")


def ci_integration_example() -> None:
    """Show a CI-style check built on ``assert_no_regression()``.

    The outcome is printed as the exit status a CI job would report;
    the process itself does not exit.
    """
    with tempfile.TemporaryDirectory() as workdir:
        suite = Benchmark("ci_check", storage_path=Path(workdir, "benchmarks"))
        outputs, targets = create_sample_data()

        # Three prior runs form the baseline.
        for _ in range(3):
            suite.evaluate(outputs, targets, metrics=["rouge_l"])

        # The run under test, followed by the regression assertion.
        rule = "=" * 50
        print(f"\n{rule}")
        print("CI Integration Example")
        print(rule)

        print("\nRunning evaluation...")
        suite.evaluate(outputs, targets, metrics=["rouge_l"])

        print("Checking for regression...")
        try:
            suite.assert_no_regression(tolerance=0.05, window=3)
        except RegressionDetectedError as err:
            print(f"Regression detected: {err}")
            print("CI status: EXIT 1")
        else:
            print("No regression detected.")
            print("CI status: EXIT 0")


def main() -> None:
    """Run every example in order, framed by a banner."""
    banner = "=" * 60
    print(banner)
    print("Veritext Benchmark & Regression Detection Examples")
    print(banner)

    examples = (
        run_benchmark_example,
        regression_detection_example,
        ci_integration_example,
    )
    for example in examples:
        example()

    print(f"\n{banner}")
    print("All examples completed.")


if __name__ == "__main__":
    main()