cli benchmark subcommands

Add benchmark run, show, and check commands for quality tracking. The check command detects regressions against a historical baseline and exits with code 1, so it can gate CI.
2025-05-10 12:01:08 +00:00
parent b02023c8f6
commit 5f619a626b
2 changed files with 168 additions and 0 deletions
@@ -0,0 +1,166 @@
+"""Benchmark commands for quality tracking."""
+
+from pathlib import Path
+from typing import Annotated
+
+import typer
+
+from veritext.benchmark import Benchmark
+from veritext.cli.formatters import (
+    console,
+    format_benchmark_history,
+    format_regression_report,
+)
+from veritext.cli.readers import read_jsonl
+
+benchmark_app = typer.Typer(
+    no_args_is_help=True,  # a bare `veritext benchmark` prints the group help
+    help="Track and compare text quality over time.",
+    name="benchmark",
+)
+
+
+@benchmark_app.command("run")
+def benchmark_run(
+    name: Annotated[str, typer.Argument(help="Name for this benchmark suite.")],
+    file: Annotated[
+        Path,
+        typer.Option("--file", "-f", help="JSONL file with candidate/reference pairs."),
+    ],
+    metrics: Annotated[
+        str,
+        typer.Option(
+            "--metrics",
+            "-m",
+            help="Comma-separated metrics to track (e.g., rouge_l,bleu4).",
+        ),
+    ] = "rouge_l,bleu4",
+    storage_path: Annotated[
+        Path,
+        typer.Option(
+            "--storage",
+            "-s",
+            help="Directory for benchmark data storage.",
+        ),
+    ] = Path("benchmarks"),
+) -> None:
+    """
+    Run a benchmark evaluation and store the results.
+
+    Blank metric names (e.g. from a trailing comma) are ignored. Exits with
+    code 1 if reading the file fails or no metric name is left.
+
+    Example:
+        veritext benchmark run my_bench -f data.jsonl -m rouge_l,bleu4
+    """
+    try:
+        pairs = read_jsonl(file)
+    except (FileNotFoundError, ValueError) as e:
+        console.print(f"[red]Error:[/red] {e}")
+        raise typer.Exit(code=1) from e
+    # An empty file is reported as a warning, not a failure (exit code 0).
+    if not pairs:
+        console.print("[yellow]Warning:[/yellow] No text pairs found in file.")
+        raise typer.Exit(code=0)
+
+    # "a,,b" or "a," would otherwise yield "" as a metric name.
+    metric_names = [m.strip() for m in metrics.split(",") if m.strip()]
+    if not metric_names:
+        console.print("[red]Error:[/red] No metrics specified.")
+        raise typer.Exit(code=1)
+
+    candidates = [p.candidate for p in pairs]
+    references = [p.reference for p in pairs]
+    bench = Benchmark(name, storage_path=storage_path)
+    run = bench.evaluate(candidates, references, metrics=metric_names)
+
+    console.print(f"[green]Benchmark '{name}' completed.[/green]")
+    console.print(f"Samples: {run.sample_count}")
+    console.print("\nMetrics:")
+    for metric_name, value in sorted(run.metrics.items()):
+        console.print(f"  {metric_name}: {value:.4f}")
+
+
+@benchmark_app.command("show")
+def benchmark_show(
+    name: Annotated[str, typer.Argument(help="Name of the benchmark suite.")],
+    last: Annotated[
+        int,
+        typer.Option("--last", "-n", min=1, help="Number of recent runs to show."),
+    ] = 20,
+    storage_path: Annotated[
+        Path,
+        typer.Option(
+            "--storage",
+            "-s",
+            help="Directory for benchmark data storage.",
+        ),
+    ] = Path("benchmarks"),
+) -> None:
+    """
+    Show benchmark history for a suite.
+
+    Prints a notice and exits with code 0 when no runs are found for the
+    suite. ``--last`` must be at least 1; smaller values are rejected.
+
+    Example:
+        veritext benchmark show my_bench --last 10
+    """
+    bench = Benchmark(name, storage_path=storage_path)
+    runs = bench.get_history(limit=last)
+
+    if not runs:
+        console.print(f"[yellow]No benchmark runs found for '{name}'.[/yellow]")
+        raise typer.Exit(code=0)
+
+    table = format_benchmark_history(runs)
+    console.print(table)
+
+
+@benchmark_app.command("check")
+def benchmark_check(
+    name: Annotated[str, typer.Argument(help="Name of the benchmark suite.")],
+    tolerance: Annotated[
+        float,
+        typer.Option(
+            "--tolerance",
+            "-t",
+            min=0.0,
+            help="Maximum allowed metric drop (e.g., 0.05 = 5%).",
+        ),
+    ] = 0.05,
+    window: Annotated[
+        int,
+        typer.Option(
+            "--window",
+            "-w",
+            min=1,
+            help="Number of historical runs for baseline.",
+        ),
+    ] = 10,
+    storage_path: Annotated[
+        Path,
+        typer.Option(
+            "--storage",
+            "-s",
+            help="Directory for benchmark data storage.",
+        ),
+    ] = Path("benchmarks"),
+) -> None:
+    """
+    Check for quality regression against historical baseline.
+
+    Always prints the regression report, then exits with code 1 if a
+    regression was detected (for CI integration).
+
+    Example:
+        veritext benchmark check my_bench --tolerance 0.05
+    """
+    bench = Benchmark(name, storage_path=storage_path)
+    report = bench.check_regression(tolerance=tolerance, window=window)
+
+    panel = format_regression_report(report)
+    console.print(panel)
+
+    if report.detected:
+        raise typer.Exit(code=1)
@@ -3,6 +3,7 @@
 import typer

 import veritext
+from veritext.cli.benchmark import benchmark_app
 from veritext.cli.validate import validate

 app = typer.Typer(
@@ -13,6 +14,7 @@ app = typer.Typer(

 # Register commands
 app.command()(validate)
+app.add_typer(benchmark_app)


@app.callback(invoke_without_command=True)