cli benchmark subcommands

Add benchmark `run`, `show`, and `check` commands for quality tracking, with regression detection to support CI integration.
2025-05-10 12:01:08 +00:00
parent b02023c8f6
commit 5f619a626b
2 changed files with 168 additions and 0 deletions
@@ -0,0 +1,166 @@
 """Benchmark commands for quality tracking."""
 from pathlib import Path
 from typing import Annotated
 import typer
 from veritext.benchmark import Benchmark
 from veritext.cli.formatters import (
    console,
    format_benchmark_history,
    format_regression_report,
 )
 from veritext.cli.readers import read_jsonl
 # Typer sub-application that groups the benchmark commands defined in this
 # module under the "benchmark" command name. With no_args_is_help=True,
 # invoking the group without a subcommand shows its help text.
 benchmark_app = typer.Typer(
    name="benchmark",
    help="Track and compare text quality over time.",
    no_args_is_help=True,
 )
@benchmark_app.command("run")
def benchmark_run(
    name: Annotated[
        str,
        typer.Argument(help="Name for this benchmark suite."),
    ],
    file: Annotated[
        Path,
        typer.Option("--file", "-f", help="JSONL file with candidate/reference pairs."),
    ],
    metrics: Annotated[
        str,
        typer.Option(
            "--metrics",
            "-m",
            help="Comma-separated metrics to track (e.g., rouge_l,bleu4).",
        ),
    ] = "rouge_l,bleu4",
    storage_path: Annotated[
        Path,
        typer.Option(
            "--storage",
            "-s",
            help="Directory for benchmark data storage.",
        ),
    ] = Path("benchmarks"),
) -> None:
    """
    Run a benchmark evaluation and store the results.
    Example:
        veritext benchmark run my_bench -f data.jsonl -m rouge_l,bleu4
    """
    # NOTE: the docstring above is the text Typer shows for `--help`, so
    # maintainer-facing notes live in comments instead.
    #
    # Exit codes raised via typer.Exit:
    #   1 - the input file could not be read/parsed, or no metric was named
    #   0 - the file was read but contained no text pairs (warning only)

    # Read text pairs; reader failures become a clean CLI error, not a traceback.
    try:
        pairs = read_jsonl(file)
    except (FileNotFoundError, ValueError) as e:
        console.print(f"[red]Error:[/red] {e}")
        raise typer.Exit(code=1) from e
    if not pairs:
        console.print("[yellow]Warning:[/yellow] No text pairs found in file.")
        raise typer.Exit(code=0)

    # Parse metrics. Blank entries (from a trailing/doubled comma or an empty
    # --metrics value) are dropped so that "" is never passed on as a metric
    # name.
    metric_names = [
        metric
        for metric in (part.strip() for part in metrics.split(","))
        if metric
    ]
    if not metric_names:
        console.print("[red]Error:[/red] No metrics specified.")
        raise typer.Exit(code=1)

    candidates = [p.candidate for p in pairs]
    references = [p.reference for p in pairs]

    # Run benchmark
    bench = Benchmark(name, storage_path=storage_path)
    run = bench.evaluate(candidates, references, metrics=metric_names)

    # Report the results, metrics in alphabetical order for stable output.
    console.print(f"[green]Benchmark '{name}' completed.[/green]")
    console.print(f"Samples: {run.sample_count}")
    console.print("\nMetrics:")
    for metric_name, value in sorted(run.metrics.items()):
        console.print(f"  {metric_name}: {value:.4f}")
@benchmark_app.command("show")
def benchmark_show(
    name: Annotated[str, typer.Argument(help="Name of the benchmark suite.")],
    last: Annotated[
        int,
        typer.Option("--last", "-n", help="Number of recent runs to show."),
    ] = 20,
    storage_path: Annotated[
        Path,
        typer.Option("--storage", "-s", help="Directory for benchmark data storage."),
    ] = Path("benchmarks"),
) -> None:
    """
    Show benchmark history for a suite.
    Example:
        veritext benchmark show my_bench --last 10
    """
    # NOTE: the docstring above is the text Typer shows for `--help`, so
    # maintainer-facing notes live in comments instead.

    # Fetch at most `last` runs recorded for this suite.
    history = Benchmark(name, storage_path=storage_path).get_history(limit=last)

    if history:
        # Render the runs with the shared history formatter and print them.
        console.print(format_benchmark_history(history))
        return

    # Nothing recorded: tell the user and finish with a success exit code.
    console.print(f"[yellow]No benchmark runs found for '{name}'.[/yellow]")
    raise typer.Exit(code=0)
@benchmark_app.command("check")
def benchmark_check(
    name: Annotated[str, typer.Argument(help="Name of the benchmark suite.")],
    tolerance: Annotated[
        float,
        typer.Option(
            "--tolerance",
            "-t",
            help="Maximum allowed metric drop (e.g., 0.05 = 5%).",
        ),
    ] = 0.05,
    window: Annotated[
        int,
        typer.Option("--window", "-w", help="Number of historical runs for baseline."),
    ] = 10,
    storage_path: Annotated[
        Path,
        typer.Option("--storage", "-s", help="Directory for benchmark data storage."),
    ] = Path("benchmarks"),
) -> None:
    """
    Check for quality regression against historical baseline.
    Exits with code 1 if regression detected (for CI integration).
    Example:
        veritext benchmark check my_bench --tolerance 0.05
    """
    # NOTE: the docstring above is the text Typer shows for `--help`, so
    # maintainer-facing notes live in comments instead.

    suite = Benchmark(name, storage_path=storage_path)
    report = suite.check_regression(tolerance=tolerance, window=window)

    # Always print the report, whether or not a regression was found.
    console.print(format_regression_report(report))

    if not report.detected:
        return

    # A non-zero exit status lets CI pipelines fail the job on regression.
    raise typer.Exit(code=1)
@@ -3,6 +3,7 @@
 import typer
 import veritext
 from veritext.cli.benchmark import benchmark_app
 from veritext.cli.validate import validate
 app = typer.Typer(
@@ -13,6 +14,7 @@ app = typer.Typer(
 # Register commands
 app.command()(validate)
 app.add_typer(benchmark_app)
@app.callback(invoke_without_command=True)