feat(cli): add benchmark subcommands

Add `benchmark run`, `benchmark show`, and `benchmark check` commands for quality tracking.
`benchmark check` detects regressions and exits with code 1, so it can gate CI.
This commit is contained in:
2026-02-03 18:20:28 +00:00
parent e128720917
commit 0cadfd4d23
2 changed files with 168 additions and 0 deletions

View File

@@ -0,0 +1,166 @@
"""Benchmark commands for quality tracking."""
from pathlib import Path
from typing import Annotated
import typer
from veritext.benchmark import Benchmark
from veritext.cli.formatters import (
console,
format_benchmark_history,
format_regression_report,
)
from veritext.cli.readers import read_jsonl
# Typer sub-application that groups the `benchmark` subcommands registered on
# it in this module (`run`, `show`, `check`). `no_args_is_help=True` makes
# `benchmark` with no subcommand print the help text.
benchmark_app = typer.Typer(
name="benchmark",
help="Track and compare text quality over time.",
no_args_is_help=True,
)
@benchmark_app.command("run")
def benchmark_run(
    name: Annotated[
        str,
        typer.Argument(help="Name for this benchmark suite."),
    ],
    file: Annotated[
        Path,
        typer.Option("--file", "-f", help="JSONL file with candidate/reference pairs."),
    ],
    metrics: Annotated[
        str,
        typer.Option(
            "--metrics",
            "-m",
            help="Comma-separated metrics to track (e.g., rouge_l,bleu4).",
        ),
    ] = "rouge_l,bleu4",
    storage_path: Annotated[
        Path,
        typer.Option(
            "--storage",
            "-s",
            help="Directory for benchmark data storage.",
        ),
    ] = Path("benchmarks"),
) -> None:
    """
    Run a benchmark evaluation and store the results.

    Example:
        veritext benchmark run my_bench -f data.jsonl -m rouge_l,bleu4
    """
    # NOTE: the docstring above is rendered by Typer as the command's help
    # text, so implementation notes live in comments instead.
    #
    # Exit codes:
    #   1 - the input file could not be read/parsed, or no metric names remain
    #       after parsing --metrics.
    #   0 - the file contained no pairs (reported as a warning, not a failure),
    #       or the run completed.

    # Read text pairs; reader failures become a clean CLI error rather than a
    # traceback.
    try:
        pairs = read_jsonl(file)
    except (FileNotFoundError, ValueError) as e:
        console.print(f"[red]Error:[/red] {e}")
        raise typer.Exit(code=1) from e

    if not pairs:
        console.print("[yellow]Warning:[/yellow] No text pairs found in file.")
        raise typer.Exit(code=0)

    # Parse metrics. Blank entries (from a trailing/leading/doubled comma or an
    # empty --metrics value) are dropped so an empty string is never passed on
    # as a metric name; dict.fromkeys removes repeats while preserving order.
    metric_names = list(
        dict.fromkeys(
            stripped for part in metrics.split(",") if (stripped := part.strip())
        )
    )
    if not metric_names:
        console.print("[red]Error:[/red] No metrics specified.")
        raise typer.Exit(code=1)

    candidates = [p.candidate for p in pairs]
    references = [p.reference for p in pairs]

    # Run benchmark
    bench = Benchmark(name, storage_path=storage_path)
    run = bench.evaluate(candidates, references, metrics=metric_names)

    console.print(f"[green]Benchmark '{name}' completed.[/green]")
    console.print(f"Samples: {run.sample_count}")
    console.print("\nMetrics:")
    # Sorted by metric name so the output order is stable between runs.
    for metric_name, value in sorted(run.metrics.items()):
        console.print(f" {metric_name}: {value:.4f}")
@benchmark_app.command("show")
def benchmark_show(
    name: Annotated[str, typer.Argument(help="Name of the benchmark suite.")],
    last: Annotated[
        int,
        typer.Option("--last", "-n", help="Number of recent runs to show."),
    ] = 20,
    storage_path: Annotated[
        Path,
        typer.Option("--storage", "-s", help="Directory for benchmark data storage."),
    ] = Path("benchmarks"),
) -> None:
    """
    Show benchmark history for a suite.

    Example:
        veritext benchmark show my_bench --last 10
    """
    # `last` is forwarded unchanged as the history limit.
    history = Benchmark(name, storage_path=storage_path).get_history(limit=last)

    if history:
        console.print(format_benchmark_history(history))
        return

    # No recorded runs is reported as a notice and still exits with status 0.
    console.print(f"[yellow]No benchmark runs found for '{name}'.[/yellow]")
    raise typer.Exit(code=0)
@benchmark_app.command("check")
def benchmark_check(
    name: Annotated[str, typer.Argument(help="Name of the benchmark suite.")],
    tolerance: Annotated[
        float,
        typer.Option(
            "--tolerance",
            "-t",
            help="Maximum allowed metric drop (e.g., 0.05 = 5%).",
        ),
    ] = 0.05,
    window: Annotated[
        int,
        typer.Option("--window", "-w", help="Number of historical runs for baseline."),
    ] = 10,
    storage_path: Annotated[
        Path,
        typer.Option("--storage", "-s", help="Directory for benchmark data storage."),
    ] = Path("benchmarks"),
) -> None:
    """
    Check for quality regression against historical baseline.

    Exits with code 1 if regression detected (for CI integration).

    Example:
        veritext benchmark check my_bench --tolerance 0.05
    """
    suite = Benchmark(name, storage_path=storage_path)
    outcome = suite.check_regression(tolerance=tolerance, window=window)

    # The report is always printed, whether or not a regression was found.
    console.print(format_regression_report(outcome))

    if not outcome.detected:
        return

    # Non-zero exit status so a calling CI job fails on regression.
    raise typer.Exit(code=1)

View File

@@ -3,6 +3,7 @@
import typer
import veritext
from veritext.cli.benchmark import benchmark_app
from veritext.cli.validate import validate
app = typer.Typer(
@@ -13,6 +14,7 @@ app = typer.Typer(
# Register commands
app.command()(validate)
app.add_typer(benchmark_app)
@app.callback(invoke_without_command=True)