Compare commits
8 Commits
feat/bench
...
feat/cli
| Author | SHA1 | Date | |
|---|---|---|---|
|
d5df8b52e6
|
|||
|
8b7c087de7
|
|||
|
c54f8c3f6f
|
|||
|
0cadfd4d23
|
|||
|
e128720917
|
|||
|
f713d5e8a6
|
|||
|
9853b57843
|
|||
|
55faae3e1b
|
@@ -83,6 +83,11 @@ Each layer depends only on layers below it.
|
|||||||
|
|
||||||
## Git Workflow
|
## Git Workflow
|
||||||
|
|
||||||
|
### Before Starting Work
|
||||||
|
|
||||||
|
When starting work from a plan, create a new branch matching the plan's scope before
|
||||||
|
making any changes. Do not reuse an existing branch from previous work, even if related.
|
||||||
|
|
||||||
### Commits
|
### Commits
|
||||||
|
|
||||||
- Format: `type(scope): description`
|
- Format: `type(scope): description`
|
||||||
|
|||||||
@@ -45,3 +45,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
- `assert_no_regression()` raises `RegressionDetectedError` for CI integration
|
- `assert_no_regression()` raises `RegressionDetectedError` for CI integration
|
||||||
- Customisable tolerance threshold and window size for regression detection
|
- Customisable tolerance threshold and window size for regression detection
|
||||||
- Metadata support for tracking git SHA, model versions, etc.
|
- Metadata support for tracking git SHA, model versions, etc.
|
||||||
|
- Command-line interface (CLI) via `veritext` command
|
||||||
|
- `veritext validate` command for inline and file-based text validation
|
||||||
|
- JSONL input format support for batch validation (`--file` option)
|
||||||
|
- Separate candidate/reference file support (`--reference-file` option)
|
||||||
|
- Multiple output formats: table (default), JSON, and simple text
|
||||||
|
- `veritext benchmark run` command for running evaluations and storing results
|
||||||
|
- `veritext benchmark show` command for viewing benchmark history
|
||||||
|
- `veritext benchmark check` command for regression detection with exit code 1 on failure
|
||||||
|
- Rich-formatted terminal output with tables and coloured panels
|
||||||
|
|||||||
5
src/veritext/cli/__init__.py
Normal file
5
src/veritext/cli/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
"""CLI module: Command-line interface for Veritext."""
|
||||||
|
|
||||||
|
from veritext.cli.main import app
|
||||||
|
|
||||||
|
__all__ = ["app"]
|
||||||
166
src/veritext/cli/benchmark.py
Normal file
166
src/veritext/cli/benchmark.py
Normal file
@@ -0,0 +1,166 @@
|
|||||||
|
"""Benchmark commands for quality tracking."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
import typer
|
||||||
|
|
||||||
|
from veritext.benchmark import Benchmark
|
||||||
|
from veritext.cli.formatters import (
|
||||||
|
console,
|
||||||
|
format_benchmark_history,
|
||||||
|
format_regression_report,
|
||||||
|
)
|
||||||
|
from veritext.cli.readers import read_jsonl
|
||||||
|
|
||||||
|
benchmark_app = typer.Typer(
|
||||||
|
name="benchmark",
|
||||||
|
help="Track and compare text quality over time.",
|
||||||
|
no_args_is_help=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@benchmark_app.command("run")
def benchmark_run(
    name: Annotated[
        str,
        typer.Argument(help="Name for this benchmark suite."),
    ],
    file: Annotated[
        Path,
        typer.Option("--file", "-f", help="JSONL file with candidate/reference pairs."),
    ],
    metrics: Annotated[
        str,
        typer.Option(
            "--metrics",
            "-m",
            help="Comma-separated metrics to track (e.g., rouge_l,bleu4).",
        ),
    ] = "rouge_l,bleu4",
    storage_path: Annotated[
        Path,
        typer.Option(
            "--storage",
            "-s",
            help="Directory for benchmark data storage.",
        ),
    ] = Path("benchmarks"),
) -> None:
    """
    Run a benchmark evaluation and store the results.

    Example:
        veritext benchmark run my_bench -f data.jsonl -m rouge_l,bleu4
    """
    # NOTE: the docstring above is rendered by Typer as the command's help text.

    # Read text pairs. OSError covers FileNotFoundError as well as e.g. a
    # directory or an unreadable file, so those are reported cleanly too.
    try:
        pairs = read_jsonl(file)
    except (OSError, ValueError) as e:
        console.print(f"[red]Error:[/red] {e}")
        raise typer.Exit(code=1) from e

    if not pairs:
        console.print("[yellow]Warning:[/yellow] No text pairs found in file.")
        raise typer.Exit(code=0)

    # Parse metrics, dropping blank entries produced by stray commas
    # (e.g. "rouge_l," or "rouge_l,,bleu4").
    metric_names = [part.strip() for part in metrics.split(",") if part.strip()]
    if not metric_names:
        console.print("[red]Error:[/red] No metrics specified.")
        raise typer.Exit(code=1)

    candidates = [p.candidate for p in pairs]
    references = [p.reference for p in pairs]

    # Run benchmark
    bench = Benchmark(name, storage_path=storage_path)
    run = bench.evaluate(candidates, references, metrics=metric_names)

    console.print(f"[green]Benchmark '{name}' completed.[/green]")
    console.print(f"Samples: {run.sample_count}")
    console.print("\nMetrics:")
    for metric_name, value in sorted(run.metrics.items()):
        console.print(f" {metric_name}: {value:.4f}")
|
||||||
|
|
||||||
|
|
||||||
|
@benchmark_app.command("show")
def benchmark_show(
    name: Annotated[
        str,
        typer.Argument(help="Name of the benchmark suite."),
    ],
    last: Annotated[
        int,
        typer.Option("--last", "-n", help="Number of recent runs to show."),
    ] = 20,
    storage_path: Annotated[
        Path,
        typer.Option(
            "--storage",
            "-s",
            help="Directory for benchmark data storage.",
        ),
    ] = Path("benchmarks"),
) -> None:
    """
    Show benchmark history for a suite.

    Example:
        veritext benchmark show my_bench --last 10
    """
    # NOTE: the docstring above is rendered by Typer as the command's help text.
    history = Benchmark(name, storage_path=storage_path).get_history(limit=last)

    if history:
        console.print(format_benchmark_history(history))
        return

    # Nothing recorded for this suite: inform the user, still a successful exit.
    console.print(f"[yellow]No benchmark runs found for '{name}'.[/yellow]")
    raise typer.Exit(code=0)
|
||||||
|
|
||||||
|
|
||||||
|
@benchmark_app.command("check")
def benchmark_check(
    name: Annotated[
        str,
        typer.Argument(help="Name of the benchmark suite."),
    ],
    tolerance: Annotated[
        float,
        typer.Option(
            "--tolerance",
            "-t",
            help="Maximum allowed metric drop (e.g., 0.05 = 5%).",
        ),
    ] = 0.05,
    window: Annotated[
        int,
        typer.Option(
            "--window",
            "-w",
            help="Number of historical runs for baseline.",
        ),
    ] = 10,
    storage_path: Annotated[
        Path,
        typer.Option(
            "--storage",
            "-s",
            help="Directory for benchmark data storage.",
        ),
    ] = Path("benchmarks"),
) -> None:
    """
    Check for quality regression against historical baseline.

    Exits with code 1 if regression detected (for CI integration).

    Example:
        veritext benchmark check my_bench --tolerance 0.05
    """
    # NOTE: the docstring above is rendered by Typer as the command's help text.
    regression = Benchmark(name, storage_path=storage_path).check_regression(
        tolerance=tolerance,
        window=window,
    )

    # Always show the report, whether or not a regression was found.
    console.print(format_regression_report(regression))

    if not regression.detected:
        return

    # Non-zero exit status lets a CI job fail on a detected regression.
    raise typer.Exit(code=1)
|
||||||
170
src/veritext/cli/formatters.py
Normal file
170
src/veritext/cli/formatters.py
Normal file
@@ -0,0 +1,170 @@
|
|||||||
|
"""Rich output formatters for CLI display."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.panel import Panel
|
||||||
|
from rich.table import Table
|
||||||
|
|
||||||
|
from veritext.benchmark.models import BenchmarkRun, RegressionReport
|
||||||
|
|
||||||
|
console = Console()
|
||||||
|
|
||||||
|
|
||||||
|
def format_validation_table(
    results: dict[str, float],
    threshold: float | None = None,
) -> Table:
    """
    Build a Rich table listing each metric and its score.

    Rows are ordered alphabetically by metric name and scores are shown with
    four decimal places. When a threshold is supplied, a third "Status"
    column marks each score as PASS (score >= threshold) or FAIL.

    Args:
        results: Dictionary of metric names to scores.
        threshold: Optional cut-off used for the pass/fail column.

    Returns:
        Rich Table object.
    """
    table = Table(title="Validation Results", show_header=True, header_style="bold")
    table.add_column("Metric", style="cyan")
    table.add_column("Score", justify="right")
    if threshold is not None:
        table.add_column("Status", justify="center")

    for name in sorted(results):
        value = results[name]
        cells = [name, f"{value:.4f}"]
        if threshold is not None:
            passed = value >= threshold
            cells.append("[green]PASS[/green]" if passed else "[red]FAIL[/red]")
        table.add_row(*cells)

    return table
|
||||||
|
|
||||||
|
|
||||||
|
def format_validation_json(results: dict[str, float]) -> str:
    """
    Serialise validation results to a JSON document.

    Keys keep the order of the input dictionary and the output is
    pretty-printed with a two-space indent.

    Args:
        results: Dictionary of metric names to scores.

    Returns:
        JSON string.
    """
    encoder = json.JSONEncoder(indent=2)
    return encoder.encode(results)
|
||||||
|
|
||||||
|
|
||||||
|
def format_validation_simple(results: dict[str, float]) -> str:
    """
    Render validation results as plain ``name: score`` lines.

    Metrics are listed alphabetically and scores use four decimal places.

    Args:
        results: Dictionary of metric names to scores.

    Returns:
        Simple text string with one metric per line (empty for no results).
    """
    rendered: list[str] = []
    for name in sorted(results):
        rendered.append(f"{name}: {results[name]:.4f}")
    return "\n".join(rendered)
|
||||||
|
|
||||||
|
|
||||||
|
def format_benchmark_history(runs: list[BenchmarkRun]) -> Table:
    """
    Format benchmark run history as a Rich table.

    The table has one row per run (in the order given) and one column per
    metric seen in any of the runs, sorted alphabetically. A metric that a
    particular run did not record is shown as "-" rather than as a score of
    zero, so a missing measurement is not mistaken for a real 0.0 result.

    Args:
        runs: List of BenchmarkRun objects (most recent first).

    Returns:
        Rich Table object.
    """
    if not runs:
        table = Table(title="Benchmark History")
        table.add_column("No runs found")
        return table

    # Union of metric names across all runs; runs may track different metrics.
    sorted_metrics = sorted({metric for run in runs for metric in run.metrics})

    table = Table(title="Benchmark History", show_header=True, header_style="bold")
    table.add_column("Timestamp", style="cyan")
    table.add_column("Samples", justify="right")
    for metric in sorted_metrics:
        table.add_column(metric, justify="right")

    for run in runs:
        timestamp = run.timestamp.strftime("%Y-%m-%d %H:%M")
        samples = str(run.sample_count)
        metric_values = [
            f"{run.metrics[m]:.4f}" if m in run.metrics else "-"
            for m in sorted_metrics
        ]
        table.add_row(timestamp, samples, *metric_values)

    return table
|
||||||
|
|
||||||
|
|
||||||
|
def format_regression_report(report: RegressionReport) -> Panel:
    """
    Render a regression report as a Rich panel.

    A clean report produces a green panel with the tolerance only. A report
    with a detected regression produces a red panel listing, per metric in
    alphabetical order, the current value, the baseline, the delta and
    whether that metric dropped by more than the tolerance.

    Args:
        report: RegressionReport object.

    Returns:
        Rich Panel object with formatted report.
    """
    tolerance_line = f"Tolerance: {report.tolerance:.2%}"

    if not report.detected:
        return Panel(
            f"[green]No regression detected.[/green]\n{tolerance_line}",
            title="Regression Check",
            border_style="green",
        )

    body = ["[red]Regression detected![/red]", tolerance_line, "", "Metric details:"]

    for name in sorted(report.deltas):
        change = report.deltas[name]
        # Metrics absent from baseline/current are displayed as 0.0.
        before = report.baseline.get(name, 0.0)
        now = report.current.get(name, 0.0)
        regressed = change < -report.tolerance
        verdict = "[red]REGRESSED[/red]" if regressed else "[green]OK[/green]"
        body.append(
            f" {name}: {now:.4f} (baseline: {before:.4f}, "
            f"delta: {change:+.4f}) {verdict}"
        )

    return Panel("\n".join(body), title="Regression Check", border_style="red")
|
||||||
|
|
||||||
|
|
||||||
|
def print_validation_output(
    results: dict[str, float],
    output_format: str = "table",
    threshold: float | None = None,
) -> None:
    """
    Render validation results in the requested format and print them.

    Any format other than 'json' or 'simple' falls back to the table view.

    Args:
        results: Dictionary of metric names to scores.
        output_format: Output format ('table', 'json', or 'simple').
        threshold: Optional threshold for pass/fail colouring (table only).
    """
    if output_format == "json":
        rendered = format_validation_json(results)
    elif output_format == "simple":
        rendered = format_validation_simple(results)
    else:
        rendered = format_validation_table(results, threshold)

    console.print(rendered)
|
||||||
37
src/veritext/cli/main.py
Normal file
37
src/veritext/cli/main.py
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
"""Veritext CLI entry point."""
|
||||||
|
|
||||||
|
import typer
|
||||||
|
|
||||||
|
import veritext
|
||||||
|
from veritext.cli.benchmark import benchmark_app
|
||||||
|
from veritext.cli.validate import validate
|
||||||
|
|
||||||
|
app = typer.Typer(
|
||||||
|
name="veritext",
|
||||||
|
help="Semantic text validation framework.",
|
||||||
|
no_args_is_help=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Register commands
|
||||||
|
app.command()(validate)
|
||||||
|
app.add_typer(benchmark_app)
|
||||||
|
|
||||||
|
|
||||||
|
@app.callback(invoke_without_command=True)
def main(
    version: bool | None = typer.Option(
        None,
        "--version",
        "-V",
        help="Show version and exit.",
        is_eager=True,
    ),
) -> None:
    """Veritext: Semantic text validation framework for Python."""
    # NOTE: the docstring above is rendered by Typer as the application help text.
    if not version:
        # No --version flag: fall through to the selected sub-command.
        return

    typer.echo(f"veritext {veritext.__version__}")
    raise typer.Exit()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
app()
|
||||||
120
src/veritext/cli/readers.py
Normal file
120
src/veritext/cli/readers.py
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
"""Input readers for CLI operations."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class TextPair:
|
||||||
|
"""A candidate-reference text pair for validation."""
|
||||||
|
|
||||||
|
candidate: str
|
||||||
|
reference: str
|
||||||
|
|
||||||
|
|
||||||
|
def read_jsonl(path: Path) -> list[TextPair]:
|
||||||
|
"""
|
||||||
|
Read text pairs from a JSONL file.
|
||||||
|
|
||||||
|
Each line must be a JSON object with 'candidate' and 'reference' keys.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path: Path to the JSONL file.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of TextPair objects.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
FileNotFoundError: If the file does not exist.
|
||||||
|
ValueError: If any line is malformed or missing required keys.
|
||||||
|
"""
|
||||||
|
if not path.exists():
|
||||||
|
raise FileNotFoundError(f"File not found: {path}")
|
||||||
|
|
||||||
|
pairs: list[TextPair] = []
|
||||||
|
with path.open() as f:
|
||||||
|
for line_num, line in enumerate(f, start=1):
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
data = json.loads(line)
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
raise ValueError(f"Invalid JSON on line {line_num}: {e}") from e
|
||||||
|
|
||||||
|
if "candidate" not in data:
|
||||||
|
raise ValueError(f"Missing 'candidate' key on line {line_num}")
|
||||||
|
if "reference" not in data:
|
||||||
|
raise ValueError(f"Missing 'reference' key on line {line_num}")
|
||||||
|
|
||||||
|
pairs.append(
|
||||||
|
TextPair(
|
||||||
|
candidate=str(data["candidate"]),
|
||||||
|
reference=str(data["reference"]),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
return pairs
|
||||||
|
|
||||||
|
|
||||||
|
def read_paired_jsonl(candidates_path: Path, references_path: Path) -> list[TextPair]:
|
||||||
|
"""
|
||||||
|
Read text pairs from separate candidate and reference JSONL files.
|
||||||
|
|
||||||
|
Each file should contain one JSON object per line with a 'text' key.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
candidates_path: Path to the candidates JSONL file.
|
||||||
|
references_path: Path to the references JSONL file.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of TextPair objects.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
FileNotFoundError: If either file does not exist.
|
||||||
|
ValueError: If files have different lengths or are malformed.
|
||||||
|
"""
|
||||||
|
candidates = _read_text_jsonl(candidates_path, "candidates")
|
||||||
|
references = _read_text_jsonl(references_path, "references")
|
||||||
|
|
||||||
|
if len(candidates) != len(references):
|
||||||
|
raise ValueError(
|
||||||
|
f"Number of candidates ({len(candidates)}) does not match "
|
||||||
|
f"number of references ({len(references)})"
|
||||||
|
)
|
||||||
|
|
||||||
|
return [
|
||||||
|
TextPair(candidate=c, reference=r)
|
||||||
|
for c, r in zip(candidates, references, strict=True)
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _read_text_jsonl(path: Path, label: str) -> list[str]:
|
||||||
|
"""Read text values from a JSONL file with 'text' key per line."""
|
||||||
|
if not path.exists():
|
||||||
|
raise FileNotFoundError(f"{label.capitalize()} file not found: {path}")
|
||||||
|
|
||||||
|
texts: list[str] = []
|
||||||
|
with path.open() as f:
|
||||||
|
for line_num, line in enumerate(f, start=1):
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
data = json.loads(line)
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
raise ValueError(
|
||||||
|
f"Invalid JSON in {label} file on line {line_num}: {e}"
|
||||||
|
) from e
|
||||||
|
|
||||||
|
if "text" not in data:
|
||||||
|
raise ValueError(
|
||||||
|
f"Missing 'text' key in {label} file on line {line_num}"
|
||||||
|
)
|
||||||
|
|
||||||
|
texts.append(str(data["text"]))
|
||||||
|
|
||||||
|
return texts
|
||||||
213
src/veritext/cli/validate.py
Normal file
213
src/veritext/cli/validate.py
Normal file
@@ -0,0 +1,213 @@
|
|||||||
|
"""Validate command for computing text metrics."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
import typer
|
||||||
|
|
||||||
|
from veritext.cli.formatters import console, print_validation_output
|
||||||
|
from veritext.cli.readers import read_jsonl, read_paired_jsonl
|
||||||
|
from veritext.metrics.bleu import Bleu
|
||||||
|
from veritext.metrics.lexical import Lexical
|
||||||
|
from veritext.metrics.rouge import Rouge
|
||||||
|
|
||||||
|
# Names of the metrics accepted by the --metrics option
|
||||||
|
AVAILABLE_METRICS = frozenset(
|
||||||
|
{"bleu", "bleu1", "bleu2", "bleu3", "bleu4", "rouge", "rouge_l", "lexical"}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_metrics(
    candidate: str,
    reference: str,
    metric_names: list[str],
) -> dict[str, float]:
    """
    Compute requested metrics for a single text pair.

    Each scorer is created and run at most once, however many of its
    variants are requested (e.g. "bleu1,bleu2,bleu4" scores BLEU once).
    Names that are not recognised are ignored.

    Args:
        candidate: Text being validated.
        reference: Text to compare against.
        metric_names: Metric names to compute: "bleu" (alias of "bleu4"),
            "bleu1".."bleu4", "rouge" (alias of "rouge_l"), "rouge_l" or
            "lexical" (which yields "jaccard" and "token_overlap").

    Returns:
        Dictionary of result names to scores, in the order first requested.
    """
    # Requested name -> attribute of the BLEU result (also the result key).
    bleu_attrs = {
        "bleu": "bleu4",
        "bleu1": "bleu1",
        "bleu2": "bleu2",
        "bleu3": "bleu3",
        "bleu4": "bleu4",
    }

    results: dict[str, float] = {}
    # Computed lazily and reused; assumes score() gives the same result for
    # the same inputs on every call — TODO confirm.
    bleu_result = None
    rouge_result = None
    lexical_result = None

    for metric in metric_names:
        if metric in bleu_attrs:
            if bleu_result is None:
                bleu_result = Bleu().score(candidate, reference)
            attr = bleu_attrs[metric]
            results[attr] = getattr(bleu_result, attr)
        elif metric in ("rouge", "rouge_l"):
            if rouge_result is None:
                rouge_result = Rouge().score(candidate, reference)
            results["rouge_l"] = rouge_result.rouge_l.fmeasure
        elif metric == "lexical":
            if lexical_result is None:
                lexical_result = Lexical().score(candidate, reference)
            results["jaccard"] = lexical_result.jaccard
            results["token_overlap"] = lexical_result.token_overlap

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_batch_metrics(
    candidates: list[str],
    references: list[str],
    metric_names: list[str],
) -> dict[str, float]:
    """
    Compute average metrics for a batch of text pairs.

    Each scorer processes the batch at most once, however many of its
    variants are requested. A metric is left out of the result when the
    batch statistics have no (truthy) entry for it. Names that are not
    recognised are ignored.

    Args:
        candidates: Texts being validated.
        references: Texts to compare against, aligned with ``candidates``.
        metric_names: Metric names to compute: "bleu" (alias of "bleu4"),
            "bleu1".."bleu4", "rouge" (alias of "rouge_l"), "rouge_l" or
            "lexical" (which yields "jaccard" and "token_overlap").

    Returns:
        Dictionary of result names to the mean score over the batch.
    """
    # Requested name -> key in the BLEU batch statistics (also the result key).
    bleu_stat_keys = {
        "bleu": "bleu4",
        "bleu1": "bleu1",
        "bleu2": "bleu2",
        "bleu3": "bleu3",
        "bleu4": "bleu4",
    }

    results: dict[str, float] = {}
    # Computed lazily and reused; assumes batch_score() gives the same result
    # for the same inputs on every call — TODO confirm.
    bleu_batch = None
    rouge_batch = None
    lexical_batch = None

    for metric in metric_names:
        if metric in bleu_stat_keys:
            if bleu_batch is None:
                bleu_batch = Bleu().batch_score(candidates, references)
            key = bleu_stat_keys[metric]
            stats = bleu_batch.stats.get(key)
            if stats:
                results[key] = stats.mean
        elif metric in ("rouge", "rouge_l"):
            if rouge_batch is None:
                rouge_batch = Rouge().batch_score(candidates, references)
            stats = rouge_batch.stats.get("rouge_l_fmeasure")
            if stats:
                results["rouge_l"] = stats.mean
        elif metric == "lexical":
            if lexical_batch is None:
                lexical_batch = Lexical().batch_score(candidates, references)
            jaccard_stats = lexical_batch.stats.get("jaccard")
            overlap_stats = lexical_batch.stats.get("token_overlap")
            if jaccard_stats:
                results["jaccard"] = jaccard_stats.mean
            if overlap_stats:
                results["token_overlap"] = overlap_stats.mean

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_metrics(metrics_str: str) -> list[str]:
    """
    Parse comma-separated metric names.

    Names are stripped and lower-cased. Blank entries (from a trailing or
    doubled comma) are ignored and repeated names are kept only once, in
    first-seen order.

    Args:
        metrics_str: Raw value of the --metrics option, e.g. "bleu, rouge".

    Returns:
        List of validated metric names.

    Raises:
        typer.BadParameter: If no metric is given or a name is not in
            AVAILABLE_METRICS.
    """
    available = ", ".join(sorted(AVAILABLE_METRICS))

    cleaned = (part.strip().lower() for part in metrics_str.split(","))
    # dict.fromkeys removes duplicates while preserving order.
    metrics = list(dict.fromkeys(name for name in cleaned if name))

    if not metrics:
        raise typer.BadParameter(f"No metrics specified. Available: {available}")

    # Validate metric names
    invalid = [m for m in metrics if m not in AVAILABLE_METRICS]
    if invalid:
        raise typer.BadParameter(
            f"Unknown metrics: {', '.join(invalid)}. "
            f"Available: {available}"
        )

    return metrics
|
||||||
|
|
||||||
|
|
||||||
|
def validate(
    text: Annotated[
        str | None,
        typer.Argument(help="Candidate text to validate (inline mode)."),
    ] = None,
    reference: Annotated[
        str | None,
        typer.Option("--reference", "-r", help="Reference text for comparison."),
    ] = None,
    file: Annotated[
        Path | None,
        typer.Option("--file", "-f", help="JSONL file with candidate/reference pairs."),
    ] = None,
    reference_file: Annotated[
        Path | None,
        typer.Option(
            "--reference-file",
            "-R",
            help="Separate JSONL file with references (requires --file).",
        ),
    ] = None,
    metrics: Annotated[
        str,
        typer.Option(
            "--metrics",
            "-m",
            help="Comma-separated metrics: bleu, bleu1-4, rouge, rouge_l, lexical.",
        ),
    ] = "bleu,rouge",
    output: Annotated[
        str,
        typer.Option("--output", "-o", help="Output format: table, json, or simple."),
    ] = "table",
    threshold: Annotated[
        float | None,
        typer.Option("--threshold", "-t", help="Score threshold for pass/fail status."),
    ] = None,
) -> None:
    """
    Validate text quality using various metrics.

    Use inline mode for single texts:
        veritext validate "text" -r "reference" -m bleu,rouge

    Use file mode for batches:
        veritext validate -f outputs.jsonl -m bleu,rouge
    """
    # NOTE: the docstring above is rendered by Typer as the command's help text.

    # Parse and validate metric names
    try:
        metric_names = _parse_metrics(metrics)
    except typer.BadParameter as e:
        console.print(f"[red]Error:[/red] {e}")
        raise typer.Exit(code=1) from e

    # Validate output format
    if output not in ("table", "json", "simple"):
        console.print(f"[red]Error:[/red] Invalid output format: {output}")
        raise typer.Exit(code=1)

    # --reference-file is only meaningful together with --file; reject it
    # explicitly instead of silently ignoring it.
    if reference_file is not None and file is None:
        console.print("[red]Error:[/red] --reference-file requires --file.")
        raise typer.Exit(code=1)

    # Determine mode: inline vs file
    if file is not None:
        # File mode. OSError covers FileNotFoundError as well as e.g. a
        # directory or an unreadable file.
        try:
            if reference_file is not None:
                pairs = read_paired_jsonl(file, reference_file)
            else:
                pairs = read_jsonl(file)
        except (OSError, ValueError) as e:
            console.print(f"[red]Error:[/red] {e}")
            raise typer.Exit(code=1) from e

        if not pairs:
            console.print("[yellow]Warning:[/yellow] No text pairs found in file.")
            raise typer.Exit(code=0)

        candidates = [p.candidate for p in pairs]
        references = [p.reference for p in pairs]

        results = _compute_batch_metrics(candidates, references, metric_names)

        # Keep JSON output machine-readable: the summary line would otherwise
        # precede the JSON document on stdout and break parsers.
        if output != "json":
            console.print(f"[dim]Evaluated {len(pairs)} text pairs.[/dim]\n")

    elif text is not None and reference is not None:
        # Inline mode
        results = _compute_metrics(text, reference, metric_names)

    else:
        # Invalid usage
        console.print(
            "[red]Error:[/red] Provide either text and --reference, "
            "or --file for batch mode."
        )
        raise typer.Exit(code=1)

    print_validation_output(results, output, threshold)
|
||||||
1
tests/test_cli/__init__.py
Normal file
1
tests/test_cli/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""CLI test suite."""
|
||||||
337
tests/test_cli/test_benchmark.py
Normal file
337
tests/test_cli/test_benchmark.py
Normal file
@@ -0,0 +1,337 @@
|
|||||||
|
"""Tests for CLI benchmark commands."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from typer.testing import CliRunner
|
||||||
|
|
||||||
|
from veritext.cli.main import app
|
||||||
|
|
||||||
|
runner = CliRunner()
|
||||||
|
|
||||||
|
|
||||||
|
class TestBenchmarkRun:
    """CLI-level checks for the ``benchmark run`` command."""

    def test_benchmark_run_basic(self, tmp_path: Path) -> None:
        """Two identical pairs are evaluated and both metrics are reported."""
        records = [
            '{"candidate": "hello world today", "reference": "hello world today"}',
            '{"candidate": "foo bar baz qux", "reference": "foo bar baz qux"}',
        ]
        data_file = tmp_path / "data.jsonl"
        data_file.write_text("\n".join(records))
        storage_path = tmp_path / "benchmarks"

        args = ["benchmark", "run", "test_bench"]
        args += ["-f", str(data_file)]
        args += ["-m", "rouge_l,bleu4"]
        args += ["-s", str(storage_path)]
        result = runner.invoke(app, args)

        assert result.exit_code == 0
        for expected in (
            "Benchmark 'test_bench' completed",
            "Samples: 2",
            "rouge_l:",
            "bleu4:",
        ):
            assert expected in result.stdout

    def test_benchmark_run_file_not_found(self, tmp_path: Path) -> None:
        """A missing input file is reported as an error with exit code 1."""
        args = ["benchmark", "run", "test_bench"]
        args += ["-f", "/nonexistent/file.jsonl"]
        args += ["-s", str(tmp_path / "benchmarks")]
        result = runner.invoke(app, args)

        assert result.exit_code == 1
        assert "Error" in result.stdout

    def test_benchmark_run_creates_storage(self, tmp_path: Path) -> None:
        """Running a benchmark creates the storage directory if needed."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
        storage_path = tmp_path / "new_benchmarks"

        args = ["benchmark", "run", "test_bench"]
        args += ["-f", str(data_file)]
        args += ["-s", str(storage_path)]
        result = runner.invoke(app, args)

        assert result.exit_code == 0
        assert storage_path.exists()
|
||||||
|
|
||||||
|
|
||||||
|
class TestBenchmarkShow:
    """Tests for benchmark show command."""

    @staticmethod
    def _record_runs(data_file: Path, storage_path: Path, count: int) -> None:
        """Store ``count`` runs of 'test_bench', failing fast if a run errors.

        Without this check a broken setup would leave no history, and the
        ``show`` command would still exit 0 ("No benchmark runs found"),
        letting the tests below pass without exercising anything.
        """
        for _ in range(count):
            setup = runner.invoke(
                app,
                [
                    "benchmark",
                    "run",
                    "test_bench",
                    "-f",
                    str(data_file),
                    "-s",
                    str(storage_path),
                ],
            )
            assert setup.exit_code == 0, setup.stdout

    def test_benchmark_show_no_runs(self, tmp_path: Path) -> None:
        """Test showing benchmark with no runs."""
        storage_path = tmp_path / "benchmarks"
        storage_path.mkdir()

        result = runner.invoke(
            app,
            [
                "benchmark",
                "show",
                "nonexistent_bench",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "No benchmark runs found" in result.stdout

    def test_benchmark_show_with_runs(self, tmp_path: Path) -> None:
        """Test showing benchmark history with runs."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello world", "reference": "hello world"}')
        storage_path = tmp_path / "benchmarks"

        # First create some runs
        self._record_runs(data_file, storage_path, count=2)

        # Show history
        result = runner.invoke(
            app,
            [
                "benchmark",
                "show",
                "test_bench",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "Benchmark History" in result.stdout

    def test_benchmark_show_limit(self, tmp_path: Path) -> None:
        """Test showing limited benchmark history."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
        storage_path = tmp_path / "benchmarks"

        # Run benchmark 3 times
        self._record_runs(data_file, storage_path, count=3)

        # Show only last 2
        result = runner.invoke(
            app,
            [
                "benchmark",
                "show",
                "test_bench",
                "--last",
                "2",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        # The history table must actually be rendered, not the empty message.
        assert "Benchmark History" in result.stdout
        assert "No benchmark runs found" not in result.stdout
|
||||||
|
|
||||||
|
|
||||||
|
class TestBenchmarkCheck:
    """Tests for the ``benchmark check`` command."""

    @staticmethod
    def _record_run(data_file: Path, storage_path: Path) -> None:
        """Invoke ``benchmark run test_bench`` once and verify it succeeded.

        Args:
            data_file: JSONL file passed to the command via ``-f``.
            storage_path: Directory passed to the command via ``-s``.
        """
        run_result = runner.invoke(
            app,
            [
                "benchmark",
                "run",
                "test_bench",
                "-f",
                str(data_file),
                "-s",
                str(storage_path),
            ],
        )
        # Fail at the setup step: an ignored failure here would otherwise show
        # up later as a confusing failure of the ``check`` assertions.
        assert run_result.exit_code == 0, run_result.stdout

    def test_benchmark_check_no_regression(self, tmp_path: Path) -> None:
        """Two runs over identical data produce no regression and exit 0."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}'
        )
        storage_path = tmp_path / "benchmarks"

        # Same data twice, so the scores cannot drop between runs.
        for _ in range(2):
            self._record_run(data_file, storage_path)

        result = runner.invoke(
            app,
            [
                "benchmark",
                "check",
                "test_bench",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "No regression detected" in result.stdout

    def test_benchmark_check_with_regression(self, tmp_path: Path) -> None:
        """A good run followed by a bad run is reported as a regression (exit 1)."""
        storage_path = tmp_path / "benchmarks"

        # First run: candidate matches the reference exactly.
        good_file = tmp_path / "good.jsonl"
        good_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}'
        )
        self._record_run(good_file, storage_path)

        # Second run: candidate shares nothing with the reference.
        bad_file = tmp_path / "bad.jsonl"
        bad_file.write_text(
            '{"candidate": "completely different", "reference": "hello world today"}'
        )
        self._record_run(bad_file, storage_path)

        result = runner.invoke(
            app,
            [
                "benchmark",
                "check",
                "test_bench",
                "-t",
                "0.05",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 1
        assert "Regression detected" in result.stdout

    def test_benchmark_check_custom_tolerance(self, tmp_path: Path) -> None:
        """``--tolerance 0.10`` is echoed as ``10.00%`` in the output."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
        storage_path = tmp_path / "benchmarks"

        self._record_run(data_file, storage_path)

        result = runner.invoke(
            app,
            [
                "benchmark",
                "check",
                "test_bench",
                "--tolerance",
                "0.10",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "10.00%" in result.stdout
|
||||||
|
|
||||||
|
|
||||||
|
class TestBenchmarkHelp:
    """Checks on the ``--help`` text of the benchmark command group."""

    def test_benchmark_help(self) -> None:
        """The group help mentions every subcommand."""
        outcome = runner.invoke(app, ["benchmark", "--help"])
        assert outcome.exit_code == 0
        for subcommand in ("run", "show", "check"):
            assert subcommand in outcome.stdout

    def test_benchmark_run_help(self) -> None:
        """``benchmark run --help`` mentions its file and metrics options."""
        outcome = runner.invoke(app, ["benchmark", "run", "--help"])
        assert outcome.exit_code == 0
        for option in ("--file", "--metrics"):
            assert option in outcome.stdout

    def test_benchmark_show_help(self) -> None:
        """``benchmark show --help`` mentions the ``--last`` option."""
        outcome = runner.invoke(app, ["benchmark", "show", "--help"])
        assert outcome.exit_code == 0
        assert "--last" in outcome.stdout

    def test_benchmark_check_help(self) -> None:
        """``benchmark check --help`` mentions tolerance and window options."""
        outcome = runner.invoke(app, ["benchmark", "check", "--help"])
        assert outcome.exit_code == 0
        for option in ("--tolerance", "--window"):
            assert option in outcome.stdout
|
||||||
141
tests/test_cli/test_formatters.py
Normal file
141
tests/test_cli/test_formatters.py
Normal file
@@ -0,0 +1,141 @@
|
|||||||
|
"""Tests for CLI output formatters."""
|
||||||
|
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
|
||||||
|
from veritext.benchmark.models import BenchmarkRun, RegressionReport
|
||||||
|
from veritext.cli.formatters import (
|
||||||
|
format_benchmark_history,
|
||||||
|
format_regression_report,
|
||||||
|
format_validation_json,
|
||||||
|
format_validation_simple,
|
||||||
|
format_validation_table,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestFormatValidationTable:
    """Tests for the ``format_validation_table`` function."""

    def test_format_empty_results(self) -> None:
        """An empty result mapping yields a titled table with no rows."""
        table = format_validation_table({})
        assert table.title == "Validation Results"
        assert table.row_count == 0

    def test_format_single_metric(self) -> None:
        """One metric produces exactly one row."""
        results = {"bleu4": 0.8523}
        table = format_validation_table(results)
        assert table.row_count == 1

    def test_format_multiple_metrics(self) -> None:
        """Each metric in the mapping produces its own row."""
        results = {"bleu4": 0.85, "rouge_l": 0.92, "jaccard": 0.75}
        table = format_validation_table(results)
        assert table.row_count == 3

    def test_format_with_threshold(self) -> None:
        """Passing a threshold keeps one row per metric and adds a status column."""
        results = {"bleu4": 0.85, "rouge_l": 0.45}
        table = format_validation_table(results, threshold=0.5)
        # Should have 3 columns: Metric, Score, Status. The original test
        # stated this expectation in a comment only; assert it explicitly.
        assert len(table.columns) == 3
        assert table.row_count == 2
|
||||||
|
|
||||||
|
|
||||||
|
class TestFormatValidationJson:
    """Checks on the JSON rendering produced by ``format_validation_json``."""

    def test_format_empty_results(self) -> None:
        """An empty mapping renders as an empty JSON object."""
        assert format_validation_json({}) == "{}"

    def test_format_results(self) -> None:
        """Every metric appears as a ``"name": value`` fragment."""
        scores = {"bleu4": 0.85, "rouge_l": 0.92}
        rendered = format_validation_json(scores)
        for fragment in ('"bleu4": 0.85', '"rouge_l": 0.92'):
            assert fragment in rendered
|
||||||
|
|
||||||
|
|
||||||
|
class TestFormatValidationSimple:
    """Checks on the plain-text rendering from ``format_validation_simple``."""

    def test_format_empty_results(self) -> None:
        """An empty mapping renders as an empty string."""
        assert format_validation_simple({}) == ""

    def test_format_results(self) -> None:
        """Every metric appears as a ``name: value`` fragment."""
        scores = {"bleu4": 0.8523, "rouge_l": 0.9234}
        rendered = format_validation_simple(scores)
        for fragment in ("bleu4: 0.8523", "rouge_l: 0.9234"):
            assert fragment in rendered
|
||||||
|
|
||||||
|
|
||||||
|
class TestFormatBenchmarkHistory:
    """Checks on the table built by ``format_benchmark_history``."""

    def test_format_empty_history(self) -> None:
        """An empty run list still yields a titled table."""
        history_table = format_benchmark_history([])
        assert history_table.title == "Benchmark History"

    def test_format_single_run(self) -> None:
        """A single run produces a single row."""
        recorded_at = datetime(2024, 1, 15, 10, 30, tzinfo=UTC)
        single_run = BenchmarkRun(
            id="test-id",
            benchmark_name="test",
            timestamp=recorded_at,
            veritext_version="0.1.0",
            metrics={"rouge_l": 0.85, "bleu4": 0.72},
            sample_count=100,
        )
        history_table = format_benchmark_history([single_run])
        assert history_table.row_count == 1

    def test_format_multiple_runs(self) -> None:
        """Three runs produce three rows."""
        history = []
        for i in range(3):
            history.append(
                BenchmarkRun(
                    id=f"test-id-{i}",
                    benchmark_name="test",
                    timestamp=datetime(2024, 1, i + 1, 10, 30, tzinfo=UTC),
                    veritext_version="0.1.0",
                    metrics={"rouge_l": 0.8 + i * 0.01},
                    sample_count=100,
                )
            )
        history_table = format_benchmark_history(history)
        assert history_table.row_count == 3
|
||||||
|
|
||||||
|
|
||||||
|
class TestFormatRegressionReport:
    """Checks on the panel built by ``format_regression_report``."""

    def test_format_no_regression(self) -> None:
        """A report without regression renders with a green border."""
        clean_report = RegressionReport(
            detected=False,
            baseline={"rouge_l": 0.85},
            current={"rouge_l": 0.86},
            deltas={"rouge_l": 0.01},
            tolerance=0.05,
        )
        rendered = format_regression_report(clean_report)
        assert rendered.title == "Regression Check"
        assert rendered.border_style == "green"

    def test_format_with_regression(self) -> None:
        """A report with a detected regression renders with a red border."""
        baseline_scores = {"rouge_l": 0.85, "bleu4": 0.72}
        current_scores = {"rouge_l": 0.70, "bleu4": 0.70}
        regressed_report = RegressionReport(
            detected=True,
            baseline=baseline_scores,
            current=current_scores,
            deltas={"rouge_l": -0.15, "bleu4": -0.02},
            tolerance=0.05,
        )
        rendered = format_regression_report(regressed_report)
        assert rendered.title == "Regression Check"
        assert rendered.border_style == "red"
|
||||||
145
tests/test_cli/test_readers.py
Normal file
145
tests/test_cli/test_readers.py
Normal file
@@ -0,0 +1,145 @@
|
|||||||
|
"""Tests for CLI input readers."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from veritext.cli.readers import TextPair, read_jsonl, read_paired_jsonl
|
||||||
|
|
||||||
|
|
||||||
|
class TestTextPair:
    """Checks on the ``TextPair`` container."""

    def test_create_text_pair(self) -> None:
        """Constructor keywords are exposed as same-named attributes."""
        built = TextPair(candidate="hello", reference="world")
        assert built.candidate == "hello"
        assert built.reference == "world"
|
||||||
|
|
||||||
|
|
||||||
|
class TestReadJsonl:
    """Checks on ``read_jsonl`` for well-formed and malformed input files."""

    def test_read_valid_jsonl(self, tmp_path: Path) -> None:
        """Two well-formed lines are read back as two pairs, in file order."""
        records = [
            {"candidate": "foo", "reference": "bar"},
            {"candidate": "baz", "reference": "qux"},
        ]
        source = tmp_path / "data.jsonl"
        serialized = [json.dumps(record) for record in records]
        source.write_text("\n".join(serialized))

        loaded = read_jsonl(source)

        assert len(loaded) == 2
        assert loaded[0].candidate == "foo"
        assert loaded[0].reference == "bar"
        assert loaded[1].candidate == "baz"
        assert loaded[1].reference == "qux"

    def test_read_empty_file(self, tmp_path: Path) -> None:
        """An empty file produces an empty list."""
        source = tmp_path / "empty.jsonl"
        source.write_text("")

        assert read_jsonl(source) == []

    def test_read_file_with_blank_lines(self, tmp_path: Path) -> None:
        """Blank lines between records are not counted as records."""
        source = tmp_path / "data.jsonl"
        source.write_text(
            '{"candidate": "a", "reference": "b"}\n\n{"candidate": "c", "reference": "d"}\n'
        )

        loaded = read_jsonl(source)

        assert len(loaded) == 2

    def test_read_file_not_found(self, tmp_path: Path) -> None:
        """A missing file raises ``FileNotFoundError``."""
        missing = tmp_path / "nonexistent.jsonl"
        with pytest.raises(FileNotFoundError):
            read_jsonl(missing)

    def test_read_invalid_json(self, tmp_path: Path) -> None:
        """Unparseable content raises ``ValueError`` naming the line."""
        source = tmp_path / "invalid.jsonl"
        source.write_text("not valid json")

        with pytest.raises(ValueError, match="Invalid JSON on line 1"):
            read_jsonl(source)

    def test_read_missing_candidate_key(self, tmp_path: Path) -> None:
        """A record without ``candidate`` raises ``ValueError`` naming the line."""
        source = tmp_path / "data.jsonl"
        source.write_text('{"reference": "bar"}')

        with pytest.raises(ValueError, match="Missing 'candidate' key on line 1"):
            read_jsonl(source)

    def test_read_missing_reference_key(self, tmp_path: Path) -> None:
        """A record without ``reference`` raises ``ValueError`` naming the line."""
        source = tmp_path / "data.jsonl"
        source.write_text('{"candidate": "foo"}')

        with pytest.raises(ValueError, match="Missing 'reference' key on line 1"):
            read_jsonl(source)
|
||||||
|
|
||||||
|
|
||||||
|
class TestReadPairedJsonl:
    """Checks on ``read_paired_jsonl`` with separate candidate/reference files."""

    def test_read_paired_valid(self, tmp_path: Path) -> None:
        """Lines from the two files are paired up position by position."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text('{"text": "foo"}\n{"text": "bar"}')
        ref_path.write_text('{"text": "baz"}\n{"text": "qux"}')

        loaded = read_paired_jsonl(cand_path, ref_path)

        assert len(loaded) == 2
        assert loaded[0].candidate == "foo"
        assert loaded[0].reference == "baz"
        assert loaded[1].candidate == "bar"
        assert loaded[1].reference == "qux"

    def test_read_paired_length_mismatch(self, tmp_path: Path) -> None:
        """Files with different line counts raise ``ValueError``."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text('{"text": "foo"}\n{"text": "bar"}')
        ref_path.write_text('{"text": "baz"}')

        with pytest.raises(ValueError, match="does not match"):
            read_paired_jsonl(cand_path, ref_path)

    def test_read_paired_candidates_not_found(self, tmp_path: Path) -> None:
        """A missing candidates file raises ``FileNotFoundError``."""
        ref_path = tmp_path / "references.jsonl"
        ref_path.write_text('{"text": "baz"}')
        missing = tmp_path / "nonexistent.jsonl"

        with pytest.raises(FileNotFoundError, match="Candidates file not found"):
            read_paired_jsonl(missing, ref_path)

    def test_read_paired_references_not_found(self, tmp_path: Path) -> None:
        """A missing references file raises ``FileNotFoundError``."""
        cand_path = tmp_path / "candidates.jsonl"
        cand_path.write_text('{"text": "foo"}')
        missing = tmp_path / "nonexistent.jsonl"

        with pytest.raises(FileNotFoundError, match="References file not found"):
            read_paired_jsonl(cand_path, missing)

    def test_read_paired_missing_text_key(self, tmp_path: Path) -> None:
        """A candidates record without ``text`` raises ``ValueError``."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text('{"value": "foo"}')
        ref_path.write_text('{"text": "baz"}')

        with pytest.raises(ValueError, match="Missing 'text' key in candidates file"):
            read_paired_jsonl(cand_path, ref_path)
|
||||||
233
tests/test_cli/test_validate.py
Normal file
233
tests/test_cli/test_validate.py
Normal file
@@ -0,0 +1,233 @@
|
|||||||
|
"""Tests for CLI validate command."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from typer.testing import CliRunner
|
||||||
|
|
||||||
|
from veritext.cli.main import app
|
||||||
|
|
||||||
|
# Module-level Typer test runner; the tests below call runner.invoke(app, [...]) on it.
runner = CliRunner()
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidateInline:
    """Checks on ``validate`` when candidate and reference are given inline."""

    def test_validate_inline_basic(self) -> None:
        """Identical texts scored with BLEU succeed and report ``bleu4``."""
        cli_args = [
            "validate",
            "The quick brown fox jumps",
            "-r",
            "The quick brown fox jumps",
            "-m",
            "bleu",
        ]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0
        assert "bleu4" in outcome.stdout

    def test_validate_inline_with_rouge(self) -> None:
        """The ROUGE metric reports ``rouge_l``."""
        cli_args = [
            "validate",
            "hello world today",
            "-r",
            "hello world here",
            "-m",
            "rouge",
        ]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0
        assert "rouge_l" in outcome.stdout

    def test_validate_inline_with_lexical(self) -> None:
        """The lexical metric reports both ``jaccard`` and ``token_overlap``."""
        cli_args = [
            "validate",
            "hello world",
            "-r",
            "hello everyone",
            "-m",
            "lexical",
        ]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0
        for metric_name in ("jaccard", "token_overlap"):
            assert metric_name in outcome.stdout

    def test_validate_inline_json_output(self) -> None:
        """``-o json`` prints parseable JSON containing ``bleu4``."""
        cli_args = [
            "validate",
            "hello world today",
            "-r",
            "hello world today",
            "-m",
            "bleu",
            "-o",
            "json",
        ]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0
        parsed = json.loads(outcome.stdout)
        assert "bleu4" in parsed

    def test_validate_inline_simple_output(self) -> None:
        """``-o simple`` prints ``rouge_l:`` in the output."""
        cli_args = [
            "validate",
            "hello world today",
            "-r",
            "hello world today",
            "-m",
            "rouge",
            "-o",
            "simple",
        ]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0
        assert "rouge_l:" in outcome.stdout

    def test_validate_inline_missing_reference(self) -> None:
        """Omitting the reference exits with code 1 and prints an error."""
        cli_args = ["validate", "hello world", "-m", "bleu"]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_validate_inline_invalid_metric(self) -> None:
        """An unrecognised metric name exits with code 1 and says so."""
        cli_args = ["validate", "hello", "-r", "world", "-m", "invalid_metric"]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 1
        assert "Unknown metrics" in outcome.stdout
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidateFile:
    """Checks on ``validate`` when input comes from JSONL files."""

    def test_validate_file_basic(self, tmp_path: Path) -> None:
        """A two-record file is scored and the pair count is reported."""
        source = tmp_path / "data.jsonl"
        source.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}\n'
            '{"candidate": "foo bar baz", "reference": "foo bar baz"}'
        )

        cli_args = ["validate", "-f", str(source), "-m", "bleu"]
        outcome = runner.invoke(app, cli_args)

        assert outcome.exit_code == 0
        assert "bleu4" in outcome.stdout
        assert "Evaluated 2 text pairs" in outcome.stdout

    def test_validate_file_not_found(self) -> None:
        """A path that does not exist exits with code 1 and prints an error."""
        cli_args = ["validate", "-f", "/nonexistent/file.jsonl", "-m", "bleu"]
        outcome = runner.invoke(app, cli_args)

        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_validate_paired_files(self, tmp_path: Path) -> None:
        """Separate candidate and reference files (``-f`` / ``-R``) are scored."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text('{"text": "hello world today"}\n{"text": "foo bar baz"}')
        ref_path.write_text('{"text": "hello world today"}\n{"text": "foo bar baz"}')

        cli_args = [
            "validate",
            "-f",
            str(cand_path),
            "-R",
            str(ref_path),
            "-m",
            "bleu",
        ]
        outcome = runner.invoke(app, cli_args)

        assert outcome.exit_code == 0
        assert "Evaluated 2 text pairs" in outcome.stdout
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidateOptions:
    """Checks on the optional flags of the ``validate`` command."""

    def test_validate_with_threshold(self) -> None:
        """``-t`` is accepted and the output carries a status indication."""
        cli_args = [
            "validate",
            "hello world today",
            "-r",
            "hello world today",
            "-m",
            "bleu",
            "-t",
            "0.5",
        ]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0
        # Either the Status column header or a PASS marker is enough.
        printed = outcome.stdout
        assert "Status" in printed or "PASS" in printed

    def test_validate_invalid_output_format(self) -> None:
        """An unsupported ``-o`` value exits with code 1 and says so."""
        cli_args = [
            "validate",
            "hello",
            "-r",
            "world",
            "-m",
            "bleu",
            "-o",
            "invalid",
        ]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 1
        assert "Invalid output format" in outcome.stdout

    def test_validate_multiple_metrics(self) -> None:
        """A comma-separated metric list reports a score from each metric."""
        cli_args = [
            "validate",
            "The quick brown fox",
            "-r",
            "The quick brown fox",
            "-m",
            "bleu,rouge,lexical",
        ]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0
        for metric_name in ("bleu4", "rouge_l", "jaccard"):
            assert metric_name in outcome.stdout
|
||||||
Reference in New Issue
Block a user