# veritext/examples/benchmark_regression.py

"""Benchmark quality tracking with regression detection.

Demonstrates Veritext's benchmark module for CI integration:
- Creating a benchmark suite
- Running evaluations and storing results
- Checking for quality regression
- CI integration pattern with exit codes
"""

import tempfile
from pathlib import Path

from veritext.benchmark import Benchmark
from veritext.core.exceptions import RegressionDetectedError


def create_sample_data() -> tuple[list[str], list[str]]:
    """Return the fixed demo corpus as ``(candidates, references)``.

    Returns:
        A tuple of two freshly built lists of five sentences each. The
        sentence at index ``i`` of the first list is the simulated
        summarisation output that goes with the reference at index ``i``
        of the second list.
    """
    # Each row is (simulated summarisation output, reference summary).
    sample_pairs = [
        (
            "The new policy aims to reduce carbon emissions by 50% by 2030.",
            "The policy targets a 50% reduction in carbon emissions by 2030.",
        ),
        (
            "Scientists discovered a new species of deep-sea fish.",
            "A new deep-sea fish species was discovered by marine biologists.",
        ),
        (
            "The company reported record profits in the third quarter.",
            "Record profits were announced by the company for Q3.",
        ),
        (
            "Researchers developed a breakthrough treatment for the disease.",
            "A breakthrough disease treatment was developed by researchers.",
        ),
        (
            "The city plans to expand public transportation routes.",
            "Public transport expansion is planned for the city.",
        ),
    ]
    outputs = [candidate for candidate, _ in sample_pairs]
    targets = [reference for _, reference in sample_pairs]
    return outputs, targets


def run_benchmark_example() -> None:
    """Evaluate the sample data once and print a summary of the run.

    The benchmark is given a storage path inside a temporary directory,
    which is deleted again when the function returns, so nothing from
    this example is left on disk.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        # Suite under test, pointed at throwaway storage.
        suite = Benchmark(
            "summariser_quality",
            storage_path=Path(scratch_dir) / "benchmarks",
        )

        outputs, targets = create_sample_data()

        print("Running benchmark evaluation...")
        result = suite.evaluate(
            candidates=outputs,
            references=targets,
            metrics=["rouge_l", "bleu4"],
            metadata={"model": "v1.0", "dataset": "test"},
        )

        # Report the run: shortened id, sample count, then one line per metric.
        print("\nBenchmark run completed:")
        print(f"  Run ID: {result.id[:8]}...")
        print(f"  Samples: {result.sample_count}")
        print("  Metrics:")
        for metric_name, score in result.metrics.items():
            print(f"    {metric_name}: {score:.4f}")


def regression_detection_example() -> None:
    """Show a regression check before and after a degraded run.

    Five evaluations of the unchanged sample data are recorded first and a
    regression check is printed. One evaluation with much shorter candidate
    texts is then recorded and the check is repeated; if that second report
    flags a regression, the per-metric baseline, current value and delta
    are printed.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        bench = Benchmark(
            "summariser_quality",
            storage_path=Path(scratch_dir) / "benchmarks",
        )

        good_outputs, references = create_sample_data()

        def record(outputs: list[str], metadata: dict[str, str]) -> None:
            # Every run in this example scores against the same references
            # with the same two metrics; only candidates and metadata vary.
            bench.evaluate(
                candidates=outputs,
                references=references,
                metrics=["rouge_l", "bleu4"],
                metadata=metadata,
            )

        # Historical runs with stable quality form the baseline.
        print("\nBuilding baseline with historical runs...")
        for run_number in range(1, 6):
            record(good_outputs, {"run": f"baseline_{run_number - 1}"})
            print(f"  Baseline run {run_number} recorded")

        # First check: no degradation expected.
        report = bench.check_regression(tolerance=0.05, window=5)
        status = "DETECTED" if report.detected else "NONE"
        print(f"\nRegression check: {status}")

        # Now record a run from a deliberately worse "model".
        print("\nSimulating degraded model output...")
        degraded_candidates = [
            "Policy carbon emissions.",  # Much shorter/worse
            "Fish discovered.",
            "Company profits.",
            "Treatment developed.",
            "Transport expansion.",
        ]
        record(degraded_candidates, {"model": "v1.1-broken"})

        # Second check: should detect the drop.
        report = bench.check_regression(tolerance=0.05, window=5)
        status = "DETECTED" if report.detected else "NONE"
        print(f"Regression check: {status}")
        if not report.detected:
            return

        print("\nRegression details:")
        for metric, delta in report.deltas.items():
            before = report.baseline.get(metric, 0)
            after = report.current.get(metric, 0)
            print(f"  {metric}: {before:.4f} -> {after:.4f} ({delta:+.4f})")


def ci_integration_example() -> None:
    """Show how ``assert_no_regression()`` could gate a CI job.

    Three baseline evaluations are recorded, then one more "current"
    evaluation, and ``assert_no_regression`` is called. The example only
    prints the exit status a CI job would use (``EXIT 0`` or ``EXIT 1``);
    it does not terminate the process.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        bench = Benchmark(
            "ci_check",
            storage_path=Path(scratch_dir) / "benchmarks",
        )

        outputs, targets = create_sample_data()

        # Record the baseline runs the check compares against.
        baseline_runs = 3
        for _ in range(baseline_runs):
            bench.evaluate(outputs, targets, metrics=["rouge_l"])

        # Everything below mimics what a CI step would do.
        rule = "=" * 50
        print("\n" + rule)
        print("CI Integration Example")
        print(rule)

        print("\nRunning evaluation...")
        bench.evaluate(outputs, targets, metrics=["rouge_l"])

        print("Checking for regression...")
        try:
            bench.assert_no_regression(tolerance=0.05, window=3)
        except RegressionDetectedError as exc:
            print(f"Regression detected: {exc}")
            print("CI status: EXIT 1")
        else:
            print("No regression detected.")
            print("CI status: EXIT 0")


def main() -> None:
    """Print a header, run each example in order, then print a footer."""
    rule = "=" * 60
    print(rule)
    print("Veritext Benchmark & Regression Detection Examples")
    print(rule)

    # Order matters only for the printed output; each example uses its own
    # temporary storage directory.
    examples = (
        run_benchmark_example,
        regression_detection_example,
        ci_integration_example,
    )
    for example in examples:
        example()

    print("\n" + rule)
    print("All examples completed.")


# Run the examples only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()