cli benchmark subcommands
Add benchmark run, show, and check commands for quality tracking with regression detection supporting CI integration.
This commit is contained in:
166
src/veritext/cli/benchmark.py
Normal file
166
src/veritext/cli/benchmark.py
Normal file
@@ -0,0 +1,166 @@
|
|||||||
|
"""Benchmark commands for quality tracking."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
import typer
|
||||||
|
|
||||||
|
from veritext.benchmark import Benchmark
|
||||||
|
from veritext.cli.formatters import (
|
||||||
|
console,
|
||||||
|
format_benchmark_history,
|
||||||
|
format_regression_report,
|
||||||
|
)
|
||||||
|
from veritext.cli.readers import read_jsonl
|
||||||
|
|
||||||
|
# Typer sub-application that groups the `benchmark` subcommands defined in
# this module. With no_args_is_help=True, invoking it without arguments shows
# the help text.
benchmark_app = typer.Typer(
    no_args_is_help=True,
    name="benchmark",
    help="Track and compare text quality over time.",
)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_metric_names(metrics: str) -> list[str]:
    """Split a comma-separated metric string into non-empty, stripped names."""
    stripped = (part.strip() for part in metrics.split(","))
    # Drop blanks produced by trailing/doubled commas or an empty option value.
    return [token for token in stripped if token]


@benchmark_app.command("run")
def benchmark_run(
    name: Annotated[
        str,
        typer.Argument(help="Name for this benchmark suite."),
    ],
    file: Annotated[
        Path,
        typer.Option("--file", "-f", help="JSONL file with candidate/reference pairs."),
    ],
    metrics: Annotated[
        str,
        typer.Option(
            "--metrics",
            "-m",
            help="Comma-separated metrics to track (e.g., rouge_l,bleu4).",
        ),
    ] = "rouge_l,bleu4",
    storage_path: Annotated[
        Path,
        typer.Option(
            "--storage",
            "-s",
            help="Directory for benchmark data storage.",
        ),
    ] = Path("benchmarks"),
) -> None:
    """
    Run a benchmark evaluation and store the results.

    Example:
        veritext benchmark run my_bench -f data.jsonl -m rouge_l,bleu4
    """
    # NOTE: the docstring above doubles as the command's --help text, so it is
    # kept short; implementation details live in comments.

    # Read text pairs. OSError (a superset of FileNotFoundError) also covers
    # unreadable paths such as directories or permission-denied files, so
    # those are reported as a clean CLI error instead of a traceback.
    try:
        pairs = read_jsonl(file)
    except (OSError, ValueError) as e:
        console.print(f"[red]Error:[/red] {e}")
        raise typer.Exit(code=1) from e

    # An empty input file is reported as a warning, not a failure.
    if not pairs:
        console.print("[yellow]Warning:[/yellow] No text pairs found in file.")
        raise typer.Exit(code=0)

    # Parse metrics, rejecting an option value that contains no usable names
    # (e.g. `-m ""` or `-m ","`) before any evaluation is attempted.
    metric_names = _parse_metric_names(metrics)
    if not metric_names:
        console.print("[red]Error:[/red] No metric names provided.")
        raise typer.Exit(code=1)

    candidates = [p.candidate for p in pairs]
    references = [p.reference for p in pairs]

    # Run benchmark
    bench = Benchmark(name, storage_path=storage_path)
    run = bench.evaluate(candidates, references, metrics=metric_names)

    console.print(f"[green]Benchmark '{name}' completed.[/green]")
    console.print(f"Samples: {run.sample_count}")
    console.print("\nMetrics:")
    # Sorted so the output order is stable across runs.
    for metric_name, value in sorted(run.metrics.items()):
        console.print(f"  {metric_name}: {value:.4f}")
|
||||||
|
|
||||||
|
|
||||||
|
@benchmark_app.command("show")
def benchmark_show(
    name: Annotated[str, typer.Argument(help="Name of the benchmark suite.")],
    last: Annotated[
        int, typer.Option("--last", "-n", help="Number of recent runs to show.")
    ] = 20,
    storage_path: Annotated[
        Path,
        typer.Option("--storage", "-s", help="Directory for benchmark data storage."),
    ] = Path("benchmarks"),
) -> None:
    """
    Show benchmark history for a suite.

    Example:
        veritext benchmark show my_bench --last 10
    """
    # Fetch at most `last` stored runs for the named suite.
    history = Benchmark(name, storage_path=storage_path).get_history(limit=last)

    if history:
        console.print(format_benchmark_history(history))
        return

    # Nothing recorded yet: tell the user, but still exit successfully.
    console.print(f"[yellow]No benchmark runs found for '{name}'.[/yellow]")
    raise typer.Exit(code=0)
|
||||||
|
|
||||||
|
|
||||||
|
@benchmark_app.command("check")
def benchmark_check(
    name: Annotated[str, typer.Argument(help="Name of the benchmark suite.")],
    tolerance: Annotated[
        float,
        typer.Option(
            "--tolerance", "-t", help="Maximum allowed metric drop (e.g., 0.05 = 5%)."
        ),
    ] = 0.05,
    window: Annotated[
        int,
        typer.Option("--window", "-w", help="Number of historical runs for baseline."),
    ] = 10,
    storage_path: Annotated[
        Path,
        typer.Option("--storage", "-s", help="Directory for benchmark data storage."),
    ] = Path("benchmarks"),
) -> None:
    """
    Check for quality regression against historical baseline.

    Exits with code 1 if regression detected (for CI integration).

    Example:
        veritext benchmark check my_bench --tolerance 0.05
    """
    suite = Benchmark(name, storage_path=storage_path)
    outcome = suite.check_regression(tolerance=tolerance, window=window)

    # Always print the report, whether or not a regression was found.
    console.print(format_regression_report(outcome))

    if not outcome.detected:
        return

    # A non-zero exit status signals the regression to the calling process.
    raise typer.Exit(code=1)
|
||||||
@@ -3,6 +3,7 @@
|
|||||||
import typer
|
import typer
|
||||||
|
|
||||||
import veritext
|
import veritext
|
||||||
|
from veritext.cli.benchmark import benchmark_app
|
||||||
from veritext.cli.validate import validate
|
from veritext.cli.validate import validate
|
||||||
|
|
||||||
app = typer.Typer(
|
app = typer.Typer(
|
||||||
@@ -13,6 +14,7 @@ app = typer.Typer(
|
|||||||
|
|
||||||
# Register commands
|
# Register commands
|
||||||
app.command()(validate)
|
app.command()(validate)
|
||||||
|
app.add_typer(benchmark_app)
|
||||||
|
|
||||||
|
|
||||||
@app.callback(invoke_without_command=True)
|
@app.callback(invoke_without_command=True)
|
||||||
|
|||||||
Reference in New Issue
Block a user