From 0cadfd4d23d854115cb55bc2bf080da91c7ec82f Mon Sep 17 00:00:00 2001
From: Kai Chappell
Date: Tue, 3 Feb 2026 18:20:28 +0000
Subject: [PATCH] feat(cli): add benchmark subcommands

Add benchmark run, show, and check commands for quality tracking
with regression detection supporting CI integration.
---
Review note: line breaks restored and metric parsing hardened by hand; the
commit id and the blob id on the "index" line were not regenerated.

 src/veritext/cli/benchmark.py | 173 ++++++++++++++++++++++++++++++++++
 src/veritext/cli/main.py      |   2 +
 2 files changed, 175 insertions(+)
 create mode 100644 src/veritext/cli/benchmark.py

diff --git a/src/veritext/cli/benchmark.py b/src/veritext/cli/benchmark.py
new file mode 100644
index 0000000..39cd517
--- /dev/null
+++ b/src/veritext/cli/benchmark.py
@@ -0,0 +1,173 @@
+"""Benchmark commands for quality tracking."""
+
+from pathlib import Path
+from typing import Annotated
+
+import typer
+
+from veritext.benchmark import Benchmark
+from veritext.cli.formatters import (
+    console,
+    format_benchmark_history,
+    format_regression_report,
+)
+from veritext.cli.readers import read_jsonl
+
+benchmark_app = typer.Typer(
+    name="benchmark",
+    help="Track and compare text quality over time.",
+    no_args_is_help=True,
+)
+
+
+@benchmark_app.command("run")
+def benchmark_run(
+    name: Annotated[
+        str,
+        typer.Argument(help="Name for this benchmark suite."),
+    ],
+    file: Annotated[
+        Path,
+        typer.Option("--file", "-f", help="JSONL file with candidate/reference pairs."),
+    ],
+    metrics: Annotated[
+        str,
+        typer.Option(
+            "--metrics",
+            "-m",
+            help="Comma-separated metrics to track (e.g., rouge_l,bleu4).",
+        ),
+    ] = "rouge_l,bleu4",
+    storage_path: Annotated[
+        Path,
+        typer.Option(
+            "--storage",
+            "-s",
+            help="Directory for benchmark data storage.",
+        ),
+    ] = Path("benchmarks"),
+) -> None:
+    """
+    Run a benchmark evaluation and store the results.
+
+    Example:
+        veritext benchmark run my_bench -f data.jsonl -m rouge_l,bleu4
+    """
+    # Read text pairs
+    try:
+        pairs = read_jsonl(file)
+    except (FileNotFoundError, ValueError) as e:
+        console.print(f"[red]Error:[/red] {e}")
+        raise typer.Exit(code=1) from e
+
+    if not pairs:
+        # An empty input file is reported as a warning, not a failure.
+        console.print("[yellow]Warning:[/yellow] No text pairs found in file.")
+        raise typer.Exit(code=0)
+
+    # Parse metrics; drop blank entries (e.g. from a trailing comma) so that
+    # empty metric names are never passed on to the benchmark.
+    metric_names = [m.strip() for m in metrics.split(",") if m.strip()]
+    if not metric_names:
+        console.print("[red]Error:[/red] No metrics specified.")
+        raise typer.Exit(code=1)
+
+    candidates = [p.candidate for p in pairs]
+    references = [p.reference for p in pairs]
+
+    # Run benchmark
+    bench = Benchmark(name, storage_path=storage_path)
+    run = bench.evaluate(candidates, references, metrics=metric_names)
+
+    console.print(f"[green]Benchmark '{name}' completed.[/green]")
+    console.print(f"Samples: {run.sample_count}")
+    console.print("\nMetrics:")
+    for metric_name, value in sorted(run.metrics.items()):
+        console.print(f"  {metric_name}: {value:.4f}")
+
+
+@benchmark_app.command("show")
+def benchmark_show(
+    name: Annotated[
+        str,
+        typer.Argument(help="Name of the benchmark suite."),
+    ],
+    last: Annotated[
+        int,
+        typer.Option("--last", "-n", help="Number of recent runs to show."),
+    ] = 20,
+    storage_path: Annotated[
+        Path,
+        typer.Option(
+            "--storage",
+            "-s",
+            help="Directory for benchmark data storage.",
+        ),
+    ] = Path("benchmarks"),
+) -> None:
+    """
+    Show benchmark history for a suite.
+
+    Example:
+        veritext benchmark show my_bench --last 10
+    """
+    bench = Benchmark(name, storage_path=storage_path)
+    runs = bench.get_history(limit=last)
+
+    if not runs:
+        # Missing history is not treated as an error: report it and exit 0.
+        console.print(f"[yellow]No benchmark runs found for '{name}'.[/yellow]")
+        raise typer.Exit(code=0)
+
+    table = format_benchmark_history(runs)
+    console.print(table)
+
+
+@benchmark_app.command("check")
+def benchmark_check(
+    name: Annotated[
+        str,
+        typer.Argument(help="Name of the benchmark suite."),
+    ],
+    tolerance: Annotated[
+        float,
+        typer.Option(
+            "--tolerance",
+            "-t",
+            help="Maximum allowed metric drop (e.g., 0.05 = 5%).",
+        ),
+    ] = 0.05,
+    window: Annotated[
+        int,
+        typer.Option(
+            "--window",
+            "-w",
+            help="Number of historical runs for baseline.",
+        ),
+    ] = 10,
+    storage_path: Annotated[
+        Path,
+        typer.Option(
+            "--storage",
+            "-s",
+            help="Directory for benchmark data storage.",
+        ),
+    ] = Path("benchmarks"),
+) -> None:
+    """
+    Check for quality regression against historical baseline.
+
+    Exits with code 1 if regression detected (for CI integration).
+
+    Example:
+        veritext benchmark check my_bench --tolerance 0.05
+    """
+    bench = Benchmark(name, storage_path=storage_path)
+    report = bench.check_regression(tolerance=tolerance, window=window)
+
+    panel = format_regression_report(report)
+    console.print(panel)
+
+    # The report is always printed first; only then signal failure to CI.
+    if report.detected:
+        raise typer.Exit(code=1)
diff --git a/src/veritext/cli/main.py b/src/veritext/cli/main.py
index a177036..cfeba73 100644
--- a/src/veritext/cli/main.py
+++ b/src/veritext/cli/main.py
@@ -3,6 +3,7 @@
 import typer
 
 import veritext
+from veritext.cli.benchmark import benchmark_app
 from veritext.cli.validate import validate
 
 app = typer.Typer(
@@ -13,6 +14,7 @@ app = typer.Typer(
 
 # Register commands
 app.command()(validate)
+app.add_typer(benchmark_app)
 
 
 @app.callback(invoke_without_command=True)