example: benchmark regression

Demonstrates benchmark quality tracking with historical comparison and CI integration using assert_no_regression() for exit code control.
2025-05-17 11:02:05 +00:00
parent 9cf968ad36
commit 0ea6adbbf4
1 changed files with 160 additions and 0 deletions
@@ -0,0 +1,160 @@
+"""Benchmark quality tracking with regression detection.
+
+Demonstrates Veritext's benchmark module for CI integration:
+- Creating a benchmark suite
+- Running evaluations and storing results
+- Checking for quality regression
+- CI integration pattern with exit codes
+"""
+
+import tempfile
+from pathlib import Path
+
+from veritext.benchmark import Benchmark
+from veritext.core.exceptions import RegressionDetectedError
+
+
+def create_sample_data() -> tuple[list[str], list[str]]:
+    """Build the aligned candidate/reference texts used by the examples.
+
+    Returns:
+        A ``(candidates, references)`` pair of equal-length lists; the
+        items at the same index in each list belong to the same sample.
+    """
+    # Each entry pairs a simulated summarisation output with its reference.
+    samples = [
+        (
+            "The new policy aims to reduce carbon emissions by 50% by 2030.",
+            "The policy targets a 50% reduction in carbon emissions by 2030.",
+        ),
+        (
+            "Scientists discovered a new species of deep-sea fish.",
+            "A new deep-sea fish species was discovered by marine biologists.",
+        ),
+        (
+            "The company reported record profits in the third quarter.",
+            "Record profits were announced by the company for Q3.",
+        ),
+        (
+            "Researchers developed a breakthrough treatment for the disease.",
+            "A breakthrough disease treatment was developed by researchers.",
+        ),
+        (
+            "The city plans to expand public transportation routes.",
+            "Public transport expansion is planned for the city.",
+        ),
+    ]
+    candidates = [candidate for candidate, _ in samples]
+    references = [reference for _, reference in samples]
+    return candidates, references
+
+
+def run_benchmark_example() -> None:
+    """Evaluate the sample data once and print a summary of the run.
+
+    The suite is pointed at a storage path inside a temporary directory,
+    which is removed again when the example finishes.
+    """
+    with tempfile.TemporaryDirectory() as workdir:
+        # Benchmark suite backed by the throwaway directory
+        suite = Benchmark(
+            "summariser_quality",
+            storage_path=Path(workdir) / "benchmarks",
+        )
+        outputs, targets = create_sample_data()
+
+        print("Running benchmark evaluation...")
+        result = suite.evaluate(
+            candidates=outputs,
+            references=targets,
+            metrics=["rouge_l", "bleu4"],
+            metadata={"model": "v1.0", "dataset": "test"},
+        )
+
+        # Report the run: shortened ID, sample count, then one line per metric
+        print("\nBenchmark run completed:")
+        print(f"  Run ID: {result.id[:8]}...")
+        print(f"  Samples: {result.sample_count}")
+        print("  Metrics:")
+        for metric_name, score in result.metrics.items():
+            print(f"    {metric_name}: {score:.4f}")
+
+
+def regression_detection_example() -> None:
+    """Show a regression check before and after a deliberately poor run.
+
+    Records five identical baseline runs and checks for regression, then
+    records one run with truncated candidates and checks again. If the
+    second check reports a regression, the per-metric baseline, current
+    value and delta are printed.
+    """
+
+    def verdict(check) -> str:
+        """Return the label this example prints for a regression report."""
+        return "DETECTED" if check.detected else "NONE"
+
+    with tempfile.TemporaryDirectory() as workdir:
+        suite = Benchmark(
+            "summariser_quality",
+            storage_path=Path(workdir) / "benchmarks",
+        )
+        outputs, targets = create_sample_data()
+
+        # Seed the history with runs of stable quality
+        print("\nBuilding baseline with historical runs...")
+        for run_number in range(1, 6):
+            suite.evaluate(
+                candidates=outputs,
+                references=targets,
+                metrics=["rouge_l", "bleu4"],
+                metadata={"run": f"baseline_{run_number - 1}"},
+            )
+            print(f"  Baseline run {run_number} recorded")
+
+        # First check: every run used the same data, so none is expected
+        first_check = suite.check_regression(tolerance=0.05, window=5)
+        print(f"\nRegression check: {verdict(first_check)}")
+
+        # Record one run from a simulated degraded model
+        print("\nSimulating degraded model output...")
+        suite.evaluate(
+            candidates=[
+                "Policy carbon emissions.",  # Much shorter/worse
+                "Fish discovered.",
+                "Company profits.",
+                "Treatment developed.",
+                "Transport expansion.",
+            ],
+            references=targets,
+            metrics=["rouge_l", "bleu4"],
+            metadata={"model": "v1.1-broken"},
+        )
+
+        # Second check: the degraded run should now be flagged
+        second_check = suite.check_regression(tolerance=0.05, window=5)
+        print(f"Regression check: {verdict(second_check)}")
+        if not second_check.detected:
+            return
+
+        print("\nRegression details:")
+        for metric, delta in second_check.deltas.items():
+            before = second_check.baseline.get(metric, 0)
+            after = second_check.current.get(metric, 0)
+            print(f"  {metric}: {before:.4f} -> {after:.4f} ({delta:+.4f})")
+
+
+def ci_integration_example() -> None:
+    """Sketch how a CI job could gate on ``assert_no_regression()``.
+
+    Seeds three baseline runs, records one more evaluation, and then
+    prints the exit status a CI job would report, depending on whether
+    ``RegressionDetectedError`` is raised. This function only prints the
+    status; it does not terminate the process itself.
+    """
+    with tempfile.TemporaryDirectory() as workdir:
+        suite = Benchmark("ci_check", storage_path=Path(workdir) / "benchmarks")
+        outputs, targets = create_sample_data()
+
+        # Seed the history that the regression check is run against
+        for _ in range(3):
+            suite.evaluate(outputs, targets, metrics=["rouge_l"])
+
+        # Section banner for the simulated CI step
+        rule = "=" * 50
+        print("\n" + rule)
+        print("CI Integration Example")
+        print(rule)
+
+        print("\nRunning evaluation...")
+        suite.evaluate(outputs, targets, metrics=["rouge_l"])
+
+        print("Checking for regression...")
+        try:
+            # Only the call that can raise lives inside the try body
+            suite.assert_no_regression(tolerance=0.05, window=3)
+        except RegressionDetectedError as err:
+            print(f"Regression detected: {err}")
+            print("CI status: EXIT 1")
+        else:
+            print("No regression detected.")
+            print("CI status: EXIT 0")
+
+
+def main() -> None:
+    """Print a banner, run every example in order, then print a footer."""
+    rule = "=" * 60
+    print(rule)
+    print("Veritext Benchmark & Regression Detection Examples")
+    print(rule)
+
+    # Examples run sequentially: single run, regression check, CI pattern
+    for example in (
+        run_benchmark_example,
+        regression_detection_example,
+        ci_integration_example,
+    ):
+        example()
+
+    print("\n" + rule)
+    print("All examples completed.")
+
+# Run the examples only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()