cli benchmark subcommands
Add benchmark `run`, `show`, and `check` commands for quality tracking, with regression detection to support CI integration.
This commit is contained in:
166
src/veritext/cli/benchmark.py
Normal file
166
src/veritext/cli/benchmark.py
Normal file
@@ -0,0 +1,166 @@
|
||||
"""Benchmark commands for quality tracking."""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Annotated
|
||||
|
||||
import typer
|
||||
|
||||
from veritext.benchmark import Benchmark
|
||||
from veritext.cli.formatters import (
|
||||
console,
|
||||
format_benchmark_history,
|
||||
format_regression_report,
|
||||
)
|
||||
from veritext.cli.readers import read_jsonl
|
||||
|
||||
# Typer sub-application grouping the "benchmark" commands; the commands in
# this module register themselves on it via @benchmark_app.command(...).
# Invoking the group with no arguments prints its help.
benchmark_app = typer.Typer(
    no_args_is_help=True,
    help="Track and compare text quality over time.",
    name="benchmark",
)
|
||||
|
||||
|
||||
@benchmark_app.command("run")
def benchmark_run(
    name: Annotated[
        str,
        typer.Argument(help="Name for this benchmark suite."),
    ],
    file: Annotated[
        Path,
        typer.Option("--file", "-f", help="JSONL file with candidate/reference pairs."),
    ],
    metrics: Annotated[
        str,
        typer.Option(
            "--metrics",
            "-m",
            help="Comma-separated metrics to track (e.g., rouge_l,bleu4).",
        ),
    ] = "rouge_l,bleu4",
    storage_path: Annotated[
        Path,
        typer.Option(
            "--storage",
            "-s",
            help="Directory for benchmark data storage.",
        ),
    ] = Path("benchmarks"),
) -> None:
    """
    Run a benchmark evaluation and store the results.

    Example:
        veritext benchmark run my_bench -f data.jsonl -m rouge_l,bleu4
    """
    # NOTE: the docstring above doubles as the command's --help text, so
    # implementation notes live in comments instead.
    #
    # Exit codes:
    #   0 - evaluation completed, or the input file contained no pairs
    #   1 - the input file could not be read/parsed, or no metric was given

    # Read text pairs; reader failures become a clean CLI error, not a traceback.
    try:
        pairs = read_jsonl(file)
    except (FileNotFoundError, ValueError) as e:
        console.print(f"[red]Error:[/red] {e}")
        raise typer.Exit(code=1) from e

    if not pairs:
        # An empty input is reported but not treated as a failure.
        console.print("[yellow]Warning:[/yellow] No text pairs found in file.")
        raise typer.Exit(code=0)

    # Parse metrics. Blank entries (from a trailing comma, a doubled comma, or
    # an empty option value) are dropped so "" is never passed on as a metric.
    metric_names = [m.strip() for m in metrics.split(",") if m.strip()]
    if not metric_names:
        console.print("[red]Error:[/red] No metrics specified.")
        raise typer.Exit(code=1)

    candidates = [p.candidate for p in pairs]
    references = [p.reference for p in pairs]

    # Run benchmark
    bench = Benchmark(name, storage_path=storage_path)
    run = bench.evaluate(candidates, references, metrics=metric_names)

    # Summarize the run; metrics are listed alphabetically for stable output.
    console.print(f"[green]Benchmark '{name}' completed.[/green]")
    console.print(f"Samples: {run.sample_count}")
    console.print("\nMetrics:")
    for metric_name, value in sorted(run.metrics.items()):
        console.print(f" {metric_name}: {value:.4f}")
|
||||
|
||||
|
||||
@benchmark_app.command("show")
def benchmark_show(
    name: Annotated[
        str,
        typer.Argument(help="Name of the benchmark suite."),
    ],
    last: Annotated[
        int,
        typer.Option("--last", "-n", help="Number of recent runs to show."),
    ] = 20,
    storage_path: Annotated[
        Path,
        typer.Option(
            "--storage",
            "-s",
            help="Directory for benchmark data storage.",
        ),
    ] = Path("benchmarks"),
) -> None:
    """
    Show benchmark history for a suite.

    Example:
        veritext benchmark show my_bench --last 10
    """
    # Fetch at most `last` runs for the named suite from the storage directory.
    history = Benchmark(name, storage_path=storage_path).get_history(limit=last)

    if history:
        console.print(format_benchmark_history(history))
        return

    # Nothing recorded yet: say so and exit successfully (code 0).
    console.print(f"[yellow]No benchmark runs found for '{name}'.[/yellow]")
    raise typer.Exit(code=0)
|
||||
|
||||
|
||||
@benchmark_app.command("check")
def benchmark_check(
    name: Annotated[
        str,
        typer.Argument(help="Name of the benchmark suite."),
    ],
    tolerance: Annotated[
        float,
        typer.Option(
            "--tolerance",
            "-t",
            help="Maximum allowed metric drop (e.g., 0.05 = 5%).",
        ),
    ] = 0.05,
    window: Annotated[
        int,
        typer.Option(
            "--window",
            "-w",
            help="Number of historical runs for baseline.",
        ),
    ] = 10,
    storage_path: Annotated[
        Path,
        typer.Option(
            "--storage",
            "-s",
            help="Directory for benchmark data storage.",
        ),
    ] = Path("benchmarks"),
) -> None:
    """
    Check for quality regression against historical baseline.

    Exits with code 1 if regression detected (for CI integration).

    Example:
        veritext benchmark check my_bench --tolerance 0.05
    """
    suite = Benchmark(name, storage_path=storage_path)
    regression = suite.check_regression(tolerance=tolerance, window=window)

    # The report is always printed, whether or not a regression was found.
    console.print(format_regression_report(regression))

    if not regression.detected:
        return

    # Non-zero exit status lets a CI job fail on a detected regression.
    raise typer.Exit(code=1)
|
||||
@@ -3,6 +3,7 @@
|
||||
import typer
|
||||
|
||||
import veritext
|
||||
from veritext.cli.benchmark import benchmark_app
|
||||
from veritext.cli.validate import validate
|
||||
|
||||
app = typer.Typer(
|
||||
@@ -13,6 +14,7 @@ app = typer.Typer(
|
||||
|
||||
# Register commands
|
||||
app.command()(validate)
|
||||
app.add_typer(benchmark_app)
|
||||
|
||||
|
||||
@app.callback(invoke_without_command=True)
|
||||
|
||||
Reference in New Issue
Block a user