8 Commits

Author SHA1 Message Date
d5df8b52e6 docs: add branch creation instruction to git workflow
Explicitly documents the requirement to create a new branch before starting
work from a plan, consistent with the parent workspace CLAUDE.md instruction.
2026-02-03 19:06:45 +00:00
8b7c087de7 docs(changelog): add CLI entries
Document command-line interface including validate command,
benchmark subcommands, and output formatting options.
2026-02-03 18:22:50 +00:00
c54f8c3f6f test(cli): add CLI tests
Add comprehensive test suite for validate command, benchmark commands,
input readers, and output formatters using Typer CliRunner.
2026-02-03 18:22:31 +00:00
0cadfd4d23 feat(cli): add benchmark subcommands
Add benchmark run, show, and check commands for quality tracking
with regression detection supporting CI integration.
2026-02-03 18:20:28 +00:00
e128720917 feat(cli): add validate command
Implement validate command with inline and file-based modes
supporting BLEU, ROUGE, and lexical metrics with multiple output formats.
2026-02-03 18:19:20 +00:00
f713d5e8a6 feat(cli): add Rich output formatters
Add formatters for validation results (table/json/simple) and
benchmark history display with regression report panels.
2026-02-03 18:17:33 +00:00
9853b57843 feat(cli): add JSONL and directory input readers
Add TextPair dataclass and read_jsonl/read_paired_jsonl functions
for parsing candidate-reference pairs from JSONL files.
2026-02-03 18:16:34 +00:00
55faae3e1b feat(cli): add CLI entry point with version command
Initialise Typer app with --version flag and help text.
2026-02-03 18:16:07 +00:00
13 changed files with 1582 additions and 0 deletions

View File

@@ -83,6 +83,11 @@ Each layer depends only on layers below it.
## Git Workflow
### Before Starting Work
When starting work from a plan, create a new branch matching the plan's scope before
making any changes. Do not reuse an existing branch from previous work, even if related.
### Commits
- Format: `type(scope): description`

View File

@@ -45,3 +45,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `assert_no_regression()` raises `RegressionDetectedError` for CI integration
- Customisable tolerance threshold and window size for regression detection
- Metadata support for tracking git SHA, model versions, etc.
- Command-line interface (CLI) via `veritext` command
- `veritext validate` command for inline and file-based text validation
- JSONL input format support for batch validation (`--file` option)
- Separate candidate/reference file support (`--reference-file` option)
- Multiple output formats: table (default), JSON, and simple text
- `veritext benchmark run` command for running evaluations and storing results
- `veritext benchmark show` command for viewing benchmark history
- `veritext benchmark check` command for regression detection with exit code 1 on failure
- Rich-formatted terminal output with tables and coloured panels

View File

@@ -0,0 +1,5 @@
"""CLI module: Command-line interface for Veritext."""
from veritext.cli.main import app
__all__ = ["app"]

View File

@@ -0,0 +1,166 @@
"""Benchmark commands for quality tracking."""
from pathlib import Path
from typing import Annotated
import typer
from veritext.benchmark import Benchmark
from veritext.cli.formatters import (
console,
format_benchmark_history,
format_regression_report,
)
from veritext.cli.readers import read_jsonl
# Typer sub-application holding the `run`, `show`, and `check` commands
# defined below; invoking it with no arguments prints its help text.
benchmark_app = typer.Typer(
    name="benchmark",
    help="Track and compare text quality over time.",
    no_args_is_help=True,
)
@benchmark_app.command("run")
def benchmark_run(
    name: Annotated[
        str,
        typer.Argument(help="Name for this benchmark suite."),
    ],
    file: Annotated[
        Path,
        typer.Option("--file", "-f", help="JSONL file with candidate/reference pairs."),
    ],
    metrics: Annotated[
        str,
        typer.Option(
            "--metrics",
            "-m",
            help="Comma-separated metrics to track (e.g., rouge_l,bleu4).",
        ),
    ] = "rouge_l,bleu4",
    storage_path: Annotated[
        Path,
        typer.Option(
            "--storage",
            "-s",
            help="Directory for benchmark data storage.",
        ),
    ] = Path("benchmarks"),
) -> None:
    """
    Run a benchmark evaluation and store the results.

    Example:
        veritext benchmark run my_bench -f data.jsonl -m rouge_l,bleu4
    """
    # Read text pairs. OSError covers a missing file as well as a path that
    # cannot be opened (directory, permissions); ValueError covers malformed
    # content. Either way the user gets a message instead of a traceback.
    try:
        pairs = read_jsonl(file)
    except (OSError, ValueError) as e:
        console.print(f"[red]Error:[/red] {e}")
        raise typer.Exit(code=1) from e

    # An empty input file is reported as a warning, not a failure.
    if not pairs:
        console.print("[yellow]Warning:[/yellow] No text pairs found in file.")
        raise typer.Exit(code=0)

    # Parse metrics, dropping blank entries such as the one left by a
    # trailing comma ("rouge_l,").
    metric_names = [m.strip() for m in metrics.split(",") if m.strip()]
    if not metric_names:
        console.print("[red]Error:[/red] No metrics specified.")
        raise typer.Exit(code=1)

    candidates = [p.candidate for p in pairs]
    references = [p.reference for p in pairs]

    # Run benchmark
    bench = Benchmark(name, storage_path=storage_path)
    run = bench.evaluate(candidates, references, metrics=metric_names)

    console.print(f"[green]Benchmark '{name}' completed.[/green]")
    console.print(f"Samples: {run.sample_count}")
    console.print("\nMetrics:")
    for metric_name, value in sorted(run.metrics.items()):
        console.print(f" {metric_name}: {value:.4f}")
@benchmark_app.command("show")
def benchmark_show(
    name: Annotated[str, typer.Argument(help="Name of the benchmark suite.")],
    last: Annotated[
        int,
        typer.Option("--last", "-n", help="Number of recent runs to show."),
    ] = 20,
    storage_path: Annotated[
        Path,
        typer.Option("--storage", "-s", help="Directory for benchmark data storage."),
    ] = Path("benchmarks"),
) -> None:
    """
    Show benchmark history for a suite.

    Example:
        veritext benchmark show my_bench --last 10
    """
    history = Benchmark(name, storage_path=storage_path).get_history(limit=last)

    if history:
        console.print(format_benchmark_history(history))
        return

    # Nothing recorded for this suite: say so and exit successfully.
    console.print(f"[yellow]No benchmark runs found for '{name}'.[/yellow]")
    raise typer.Exit(code=0)
@benchmark_app.command("check")
def benchmark_check(
    name: Annotated[str, typer.Argument(help="Name of the benchmark suite.")],
    tolerance: Annotated[
        float,
        typer.Option(
            "--tolerance",
            "-t",
            help="Maximum allowed metric drop (e.g., 0.05 = 5%).",
        ),
    ] = 0.05,
    window: Annotated[
        int,
        typer.Option("--window", "-w", help="Number of historical runs for baseline."),
    ] = 10,
    storage_path: Annotated[
        Path,
        typer.Option("--storage", "-s", help="Directory for benchmark data storage."),
    ] = Path("benchmarks"),
) -> None:
    """
    Check for quality regression against historical baseline.

    Exits with code 1 if regression detected (for CI integration).

    Example:
        veritext benchmark check my_bench --tolerance 0.05
    """
    report = Benchmark(name, storage_path=storage_path).check_regression(
        tolerance=tolerance,
        window=window,
    )
    console.print(format_regression_report(report))

    if not report.detected:
        return
    # Non-zero exit status lets CI pipelines fail the job on regression.
    raise typer.Exit(code=1)

View File

@@ -0,0 +1,170 @@
"""Rich output formatters for CLI display."""
import json
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from veritext.benchmark.models import BenchmarkRun, RegressionReport
# Shared Rich console; the benchmark and validate command modules import it
# from here for their terminal output.
console = Console()
def format_validation_table(
    results: dict[str, float],
    threshold: float | None = None,
) -> Table:
    """
    Build a Rich table listing each metric and its score.

    Args:
        results: Dictionary of metric names to scores.
        threshold: When given, a "Status" column marks each score PASS
            (score >= threshold) or FAIL.

    Returns:
        Rich Table object with rows sorted by metric name.
    """
    table = Table(title="Validation Results", show_header=True, header_style="bold")
    table.add_column("Metric", style="cyan")
    table.add_column("Score", justify="right")
    if threshold is not None:
        table.add_column("Status", justify="center")

    for name in sorted(results):
        value = results[name]
        cells = [name, f"{value:.4f}"]
        if threshold is not None:
            cells.append(
                "[green]PASS[/green]" if value >= threshold else "[red]FAIL[/red]"
            )
        table.add_row(*cells)
    return table
def format_validation_json(results: dict[str, float]) -> str:
    """
    Format validation results as JSON.

    Keys are emitted in sorted order so the output is deterministic and
    matches the metric ordering used by the table and simple formats.

    Args:
        results: Dictionary of metric names to scores.

    Returns:
        JSON string, indented by two spaces.
    """
    return json.dumps(results, indent=2, sort_keys=True)
def format_validation_simple(results: dict[str, float]) -> str:
    """
    Render validation results as plain ``name: score`` lines.

    Args:
        results: Dictionary of metric names to scores.

    Returns:
        One line per metric, sorted by metric name, scores to four decimal
        places; an empty string when there are no results.
    """
    return "\n".join(f"{name}: {results[name]:.4f}" for name in sorted(results))
def format_benchmark_history(runs: list[BenchmarkRun]) -> Table:
    """
    Format benchmark run history as a Rich table.

    The table has one row per run, in the order given, and one column per
    metric name found in any run. A metric that a particular run did not
    record is shown as "-" rather than a numeric zero, so a missing value is
    never mistaken for a real score of 0.

    Args:
        runs: List of BenchmarkRun objects.

    Returns:
        Rich Table object; a placeholder table when ``runs`` is empty.
    """
    if not runs:
        table = Table(title="Benchmark History")
        table.add_column("No runs found")
        return table

    # Union of metric names across all runs, in a stable column order.
    sorted_metrics = sorted({metric for run in runs for metric in run.metrics})

    table = Table(title="Benchmark History", show_header=True, header_style="bold")
    table.add_column("Timestamp", style="cyan")
    table.add_column("Samples", justify="right")
    for metric in sorted_metrics:
        table.add_column(metric, justify="right")

    for run in runs:
        timestamp = run.timestamp.strftime("%Y-%m-%d %H:%M")
        samples = str(run.sample_count)
        metric_values = [
            f"{run.metrics[m]:.4f}" if m in run.metrics else "-"
            for m in sorted_metrics
        ]
        table.add_row(timestamp, samples, *metric_values)
    return table
def format_regression_report(report: RegressionReport) -> Panel:
    """
    Render a regression report as a Rich panel.

    Args:
        report: RegressionReport object.

    Returns:
        A green-bordered panel when no regression was detected; otherwise a
        red-bordered panel with one line per metric in ``report.deltas``,
        each flagged REGRESSED when its delta is below ``-report.tolerance``.
    """
    tolerance_line = f"Tolerance: {report.tolerance:.2%}"

    if report.detected:
        body = [
            "[red]Regression detected![/red]",
            tolerance_line,
            "",
            "Metric details:",
        ]
        for name in sorted(report.deltas):
            change = report.deltas[name]
            previous = report.baseline.get(name, 0.0)
            latest = report.current.get(name, 0.0)
            verdict = (
                "[red]REGRESSED[/red]"
                if change < -report.tolerance
                else "[green]OK[/green]"
            )
            body.append(
                f" {name}: {latest:.4f} (baseline: {previous:.4f}, "
                f"delta: {change:+.4f}) {verdict}"
            )
        return Panel("\n".join(body), title="Regression Check", border_style="red")

    summary = f"[green]No regression detected.[/green]\n{tolerance_line}"
    return Panel(summary, title="Regression Check", border_style="green")
def print_validation_output(
    results: dict[str, float],
    output_format: str = "table",
    threshold: float | None = None,
) -> None:
    """
    Render validation results in the chosen format and print them.

    Args:
        results: Dictionary of metric names to scores.
        output_format: 'json' or 'simple'; any other value produces the table.
        threshold: Optional threshold for pass/fail colouring (table only).
    """
    rendered: str | Table
    if output_format == "json":
        rendered = format_validation_json(results)
    elif output_format == "simple":
        rendered = format_validation_simple(results)
    else:
        rendered = format_validation_table(results, threshold)
    console.print(rendered)

37
src/veritext/cli/main.py Normal file
View File

@@ -0,0 +1,37 @@
"""Veritext CLI entry point."""
import typer
import veritext
from veritext.cli.benchmark import benchmark_app
from veritext.cli.validate import validate
# Root Typer application; invoking it with no arguments prints its help text.
app = typer.Typer(
    name="veritext",
    help="Semantic text validation framework.",
    no_args_is_help=True,
)
# Register commands
# `validate` is added as a top-level command; the benchmark commands are
# attached as a nested Typer application.
app.command()(validate)
app.add_typer(benchmark_app)
@app.callback(invoke_without_command=True)
def main(
    version: bool | None = typer.Option(
        None,
        "--version",
        "-V",
        help="Show version and exit.",
        is_eager=True,
    ),
) -> None:
    """Veritext: Semantic text validation framework for Python."""
    if not version:
        return
    # --version short-circuits everything else: print and stop.
    typer.echo(f"veritext {veritext.__version__}")
    raise typer.Exit()
# Run the CLI when this module is executed directly as a script.
if __name__ == "__main__":
    app()

120
src/veritext/cli/readers.py Normal file
View File

@@ -0,0 +1,120 @@
"""Input readers for CLI operations."""
import json
from dataclasses import dataclass
from pathlib import Path
@dataclass
class TextPair:
"""A candidate-reference text pair for validation."""
candidate: str
reference: str
def read_jsonl(path: Path) -> list[TextPair]:
"""
Read text pairs from a JSONL file.
Each line must be a JSON object with 'candidate' and 'reference' keys.
Args:
path: Path to the JSONL file.
Returns:
List of TextPair objects.
Raises:
FileNotFoundError: If the file does not exist.
ValueError: If any line is malformed or missing required keys.
"""
if not path.exists():
raise FileNotFoundError(f"File not found: {path}")
pairs: list[TextPair] = []
with path.open() as f:
for line_num, line in enumerate(f, start=1):
line = line.strip()
if not line:
continue
try:
data = json.loads(line)
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON on line {line_num}: {e}") from e
if "candidate" not in data:
raise ValueError(f"Missing 'candidate' key on line {line_num}")
if "reference" not in data:
raise ValueError(f"Missing 'reference' key on line {line_num}")
pairs.append(
TextPair(
candidate=str(data["candidate"]),
reference=str(data["reference"]),
)
)
return pairs
def read_paired_jsonl(candidates_path: Path, references_path: Path) -> list[TextPair]:
"""
Read text pairs from separate candidate and reference JSONL files.
Each file should contain one JSON object per line with a 'text' key.
Args:
candidates_path: Path to the candidates JSONL file.
references_path: Path to the references JSONL file.
Returns:
List of TextPair objects.
Raises:
FileNotFoundError: If either file does not exist.
ValueError: If files have different lengths or are malformed.
"""
candidates = _read_text_jsonl(candidates_path, "candidates")
references = _read_text_jsonl(references_path, "references")
if len(candidates) != len(references):
raise ValueError(
f"Number of candidates ({len(candidates)}) does not match "
f"number of references ({len(references)})"
)
return [
TextPair(candidate=c, reference=r)
for c, r in zip(candidates, references, strict=True)
]
def _read_text_jsonl(path: Path, label: str) -> list[str]:
"""Read text values from a JSONL file with 'text' key per line."""
if not path.exists():
raise FileNotFoundError(f"{label.capitalize()} file not found: {path}")
texts: list[str] = []
with path.open() as f:
for line_num, line in enumerate(f, start=1):
line = line.strip()
if not line:
continue
try:
data = json.loads(line)
except json.JSONDecodeError as e:
raise ValueError(
f"Invalid JSON in {label} file on line {line_num}: {e}"
) from e
if "text" not in data:
raise ValueError(
f"Missing 'text' key in {label} file on line {line_num}"
)
texts.append(str(data["text"]))
return texts

View File

@@ -0,0 +1,213 @@
"""Validate command for computing text metrics."""
from pathlib import Path
from typing import Annotated
import typer
from veritext.cli.formatters import console, print_validation_output
from veritext.cli.readers import read_jsonl, read_paired_jsonl
from veritext.metrics.bleu import Bleu
from veritext.metrics.lexical import Lexical
from veritext.metrics.rouge import Rouge
# Set of metric names accepted on the command line; _parse_metrics rejects
# any name that is not a member.
AVAILABLE_METRICS = frozenset(
    {"bleu", "bleu1", "bleu2", "bleu3", "bleu4", "rouge", "rouge_l", "lexical"}
)
def _compute_metrics(
    candidate: str,
    reference: str,
    metric_names: list[str],
) -> dict[str, float]:
    """
    Compute requested metrics for a single text pair.

    Each scorer (BLEU, ROUGE, lexical) is constructed and run at most once
    per call, and only if one of its metrics was requested, instead of being
    re-run for every requested variant.
    NOTE(review): this assumes scoring the same pair twice gives the same
    result - confirm against the metric implementations.

    Args:
        candidate: Text being evaluated.
        reference: Text to compare against.
        metric_names: Metric names to compute; names not handled below are
            ignored.

    Returns:
        Mapping of result key to score. ``bleu`` is reported under
        ``bleu4``, ``rouge`` under ``rouge_l``, and ``lexical`` expands to
        ``jaccard`` and ``token_overlap``.
    """
    results: dict[str, float] = {}
    bleu_result = None
    rouge_result = None
    lexical_result = None

    for metric in metric_names:
        if metric in ("bleu", "bleu1", "bleu2", "bleu3", "bleu4"):
            if bleu_result is None:
                bleu_result = Bleu().score(candidate, reference)
            # Plain "bleu" is an alias for the 4-gram score.
            key = "bleu4" if metric == "bleu" else metric
            results[key] = getattr(bleu_result, key)
        elif metric in ("rouge", "rouge_l"):
            if rouge_result is None:
                rouge_result = Rouge().score(candidate, reference)
            results["rouge_l"] = rouge_result.rouge_l.fmeasure
        elif metric == "lexical":
            if lexical_result is None:
                lexical_result = Lexical().score(candidate, reference)
            results["jaccard"] = lexical_result.jaccard
            results["token_overlap"] = lexical_result.token_overlap
    return results
def _compute_batch_metrics(
    candidates: list[str],
    references: list[str],
    metric_names: list[str],
) -> dict[str, float]:
    """
    Compute average metrics for a batch of text pairs.

    Each scorer's batch computation runs at most once per call, and only if
    one of its metrics was requested; asking for several BLEU variants no
    longer re-scores the whole batch for each one.
    NOTE(review): this assumes batch scoring is deterministic - confirm
    against the metric implementations.

    Args:
        candidates: Candidate texts.
        references: Reference texts, paired with ``candidates`` by position.
        metric_names: Metric names to compute; names not handled below are
            ignored.

    Returns:
        Mapping of result key to the mean reported in the batch statistics.
        A key is omitted when the batch result has no statistics entry for
        it.
    """
    results: dict[str, float] = {}
    bleu_batch = None
    rouge_batch = None
    lexical_batch = None

    for metric in metric_names:
        if metric in ("bleu", "bleu1", "bleu2", "bleu3", "bleu4"):
            if bleu_batch is None:
                bleu_batch = Bleu().batch_score(candidates, references)
            # Plain "bleu" is an alias for the 4-gram score.
            key = "bleu4" if metric == "bleu" else metric
            stats = bleu_batch.stats.get(key)
            if stats:
                results[key] = stats.mean
        elif metric in ("rouge", "rouge_l"):
            if rouge_batch is None:
                rouge_batch = Rouge().batch_score(candidates, references)
            stats = rouge_batch.stats.get("rouge_l_fmeasure")
            if stats:
                results["rouge_l"] = stats.mean
        elif metric == "lexical":
            if lexical_batch is None:
                lexical_batch = Lexical().batch_score(candidates, references)
            jaccard_stats = lexical_batch.stats.get("jaccard")
            overlap_stats = lexical_batch.stats.get("token_overlap")
            if jaccard_stats:
                results["jaccard"] = jaccard_stats.mean
            if overlap_stats:
                results["token_overlap"] = overlap_stats.mean
    return results
def _parse_metrics(metrics_str: str) -> list[str]:
    """
    Parse comma-separated metric names.

    Names are stripped and lower-cased. Blank entries (for example from a
    trailing comma, as in "bleu,") are ignored rather than reported as an
    unknown metric with an empty name.

    Args:
        metrics_str: Raw value of the metrics option.

    Returns:
        The normalised metric names, in the order given.

    Raises:
        typer.BadParameter: If no metric names remain, or any name is not in
            AVAILABLE_METRICS.
    """
    metrics = [m.strip().lower() for m in metrics_str.split(",") if m.strip()]
    available = ", ".join(sorted(AVAILABLE_METRICS))
    if not metrics:
        raise typer.BadParameter(f"No metrics specified. Available: {available}")

    # Validate metric names
    invalid = [m for m in metrics if m not in AVAILABLE_METRICS]
    if invalid:
        raise typer.BadParameter(
            f"Unknown metrics: {', '.join(invalid)}. "
            f"Available: {available}"
        )
    return metrics
def validate(
    text: Annotated[
        str | None,
        typer.Argument(help="Candidate text to validate (inline mode)."),
    ] = None,
    reference: Annotated[
        str | None,
        typer.Option("--reference", "-r", help="Reference text for comparison."),
    ] = None,
    file: Annotated[
        Path | None,
        typer.Option("--file", "-f", help="JSONL file with candidate/reference pairs."),
    ] = None,
    reference_file: Annotated[
        Path | None,
        typer.Option(
            "--reference-file",
            "-R",
            help="Separate JSONL file with references (requires --file).",
        ),
    ] = None,
    metrics: Annotated[
        str,
        typer.Option(
            "--metrics",
            "-m",
            help="Comma-separated metrics: bleu, bleu1-4, rouge, rouge_l, lexical.",
        ),
    ] = "bleu,rouge",
    output: Annotated[
        str,
        typer.Option("--output", "-o", help="Output format: table, json, or simple."),
    ] = "table",
    threshold: Annotated[
        float | None,
        typer.Option("--threshold", "-t", help="Score threshold for pass/fail status."),
    ] = None,
) -> None:
    """
    Validate text quality using various metrics.

    Use inline mode for single texts:
        veritext validate "text" -r "reference" -m bleu,rouge

    Use file mode for batches:
        veritext validate -f outputs.jsonl -m bleu,rouge
    """
    # Parse and validate metric names
    try:
        metric_names = _parse_metrics(metrics)
    except typer.BadParameter as e:
        console.print(f"[red]Error:[/red] {e}")
        raise typer.Exit(code=1) from e

    # Validate output format
    if output not in ("table", "json", "simple"):
        console.print(f"[red]Error:[/red] Invalid output format: {output}")
        raise typer.Exit(code=1)

    # --reference-file is documented as requiring --file; reject the
    # combination explicitly instead of silently ignoring the option.
    if reference_file is not None and file is None:
        console.print("[red]Error:[/red] --reference-file requires --file.")
        raise typer.Exit(code=1)

    # Determine mode: inline vs file. When both a file and inline text are
    # supplied, file mode takes precedence.
    if file is not None:
        # File mode. OSError covers missing and unreadable paths; ValueError
        # covers malformed content and mismatched file lengths.
        try:
            if reference_file is not None:
                pairs = read_paired_jsonl(file, reference_file)
            else:
                pairs = read_jsonl(file)
        except (OSError, ValueError) as e:
            console.print(f"[red]Error:[/red] {e}")
            raise typer.Exit(code=1) from e

        if not pairs:
            console.print("[yellow]Warning:[/yellow] No text pairs found in file.")
            raise typer.Exit(code=0)

        candidates = [p.candidate for p in pairs]
        references = [p.reference for p in pairs]
        results = _compute_batch_metrics(candidates, references, metric_names)
        # Keep JSON output machine-parseable: the human-readable summary
        # line is only printed for the other formats.
        if output != "json":
            console.print(f"[dim]Evaluated {len(pairs)} text pairs.[/dim]\n")
    elif text is not None and reference is not None:
        # Inline mode
        results = _compute_metrics(text, reference, metric_names)
    else:
        # Invalid usage
        console.print(
            "[red]Error:[/red] Provide either text and --reference, "
            "or --file for batch mode."
        )
        raise typer.Exit(code=1)

    print_validation_output(results, output, threshold)

View File

@@ -0,0 +1 @@
"""CLI test suite."""

View File

@@ -0,0 +1,337 @@
"""Tests for CLI benchmark commands."""
from pathlib import Path
from typer.testing import CliRunner
from veritext.cli.main import app
# Shared CliRunner used by the tests below to invoke the Typer app.
runner = CliRunner()
class TestBenchmarkRun:
    """Exercise the `benchmark run` command through the Typer app."""

    def test_benchmark_run_basic(self, tmp_path: Path) -> None:
        """A two-pair file runs cleanly and reports samples and metrics."""
        rows = [
            '{"candidate": "hello world today", "reference": "hello world today"}',
            '{"candidate": "foo bar baz qux", "reference": "foo bar baz qux"}',
        ]
        data_file = tmp_path / "data.jsonl"
        data_file.write_text("\n".join(rows))
        storage_path = tmp_path / "benchmarks"

        args = ["benchmark", "run", "test_bench"]
        args += ["-f", str(data_file)]
        args += ["-m", "rouge_l,bleu4"]
        args += ["-s", str(storage_path)]
        result = runner.invoke(app, args)

        assert result.exit_code == 0
        assert "Benchmark 'test_bench' completed" in result.stdout
        assert "Samples: 2" in result.stdout
        assert "rouge_l:" in result.stdout
        assert "bleu4:" in result.stdout

    def test_benchmark_run_file_not_found(self, tmp_path: Path) -> None:
        """A missing input file yields exit code 1 and an error message."""
        args = ["benchmark", "run", "test_bench"]
        args += ["-f", "/nonexistent/file.jsonl"]
        args += ["-s", str(tmp_path / "benchmarks")]
        result = runner.invoke(app, args)

        assert result.exit_code == 1
        assert "Error" in result.stdout

    def test_benchmark_run_creates_storage(self, tmp_path: Path) -> None:
        """Running against a fresh storage path creates that directory."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
        storage_path = tmp_path / "new_benchmarks"

        args = ["benchmark", "run", "test_bench"]
        args += ["-f", str(data_file)]
        args += ["-s", str(storage_path)]
        result = runner.invoke(app, args)

        assert result.exit_code == 0
        assert storage_path.exists()
class TestBenchmarkShow:
    """Exercise the `benchmark show` command through the Typer app."""

    @staticmethod
    def _record_runs(data_file: Path, storage_path: Path, count: int) -> None:
        """Invoke `benchmark run` ``count`` times to populate the history."""
        run_args = ["benchmark", "run", "test_bench"]
        run_args += ["-f", str(data_file), "-s", str(storage_path)]
        for _ in range(count):
            runner.invoke(app, run_args)

    def test_benchmark_show_no_runs(self, tmp_path: Path) -> None:
        """An unknown suite exits 0 with a 'no runs' notice."""
        storage_path = tmp_path / "benchmarks"
        storage_path.mkdir()

        args = ["benchmark", "show", "nonexistent_bench", "-s", str(storage_path)]
        result = runner.invoke(app, args)

        assert result.exit_code == 0
        assert "No benchmark runs found" in result.stdout

    def test_benchmark_show_with_runs(self, tmp_path: Path) -> None:
        """After two recorded runs the history table is displayed."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello world", "reference": "hello world"}')
        storage_path = tmp_path / "benchmarks"
        self._record_runs(data_file, storage_path, 2)

        args = ["benchmark", "show", "test_bench", "-s", str(storage_path)]
        result = runner.invoke(app, args)

        assert result.exit_code == 0
        assert "Benchmark History" in result.stdout

    def test_benchmark_show_limit(self, tmp_path: Path) -> None:
        """The --last option is accepted when more runs exist than requested."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
        storage_path = tmp_path / "benchmarks"
        self._record_runs(data_file, storage_path, 3)

        args = ["benchmark", "show", "test_bench", "--last", "2"]
        args += ["-s", str(storage_path)]
        result = runner.invoke(app, args)

        assert result.exit_code == 0
class TestBenchmarkCheck:
    """Exercise the `benchmark check` command through the Typer app."""

    @staticmethod
    def _record_run(data_file: Path, storage_path: Path) -> None:
        """Invoke `benchmark run` once for the shared `test_bench` suite."""
        run_args = ["benchmark", "run", "test_bench"]
        run_args += ["-f", str(data_file), "-s", str(storage_path)]
        runner.invoke(app, run_args)

    def test_benchmark_check_no_regression(self, tmp_path: Path) -> None:
        """Two identical runs produce exit code 0 and a 'no regression' notice."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}'
        )
        storage_path = tmp_path / "benchmarks"
        for _ in range(2):
            self._record_run(data_file, storage_path)

        args = ["benchmark", "check", "test_bench", "-s", str(storage_path)]
        result = runner.invoke(app, args)

        assert result.exit_code == 0
        assert "No regression detected" in result.stdout

    def test_benchmark_check_with_regression(self, tmp_path: Path) -> None:
        """A good run followed by a bad run produces exit code 1."""
        storage_path = tmp_path / "benchmarks"

        # Baseline run: candidate matches the reference exactly.
        good_file = tmp_path / "good.jsonl"
        good_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}'
        )
        self._record_run(good_file, storage_path)

        # Follow-up run: candidate shares nothing with the reference.
        bad_file = tmp_path / "bad.jsonl"
        bad_file.write_text(
            '{"candidate": "completely different", "reference": "hello world today"}'
        )
        self._record_run(bad_file, storage_path)

        args = ["benchmark", "check", "test_bench", "-t", "0.05"]
        args += ["-s", str(storage_path)]
        result = runner.invoke(app, args)

        assert result.exit_code == 1
        assert "Regression detected" in result.stdout

    def test_benchmark_check_custom_tolerance(self, tmp_path: Path) -> None:
        """A custom tolerance is echoed back as a percentage in the report."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
        storage_path = tmp_path / "benchmarks"
        self._record_run(data_file, storage_path)

        args = ["benchmark", "check", "test_bench", "--tolerance", "0.10"]
        args += ["-s", str(storage_path)]
        result = runner.invoke(app, args)

        assert result.exit_code == 0
        assert "10.00%" in result.stdout
class TestBenchmarkHelp:
    """Check the --help output of the benchmark command group."""

    def test_benchmark_help(self) -> None:
        """The group help lists all three subcommands."""
        result = runner.invoke(app, ["benchmark", "--help"])
        assert result.exit_code == 0
        for subcommand in ("run", "show", "check"):
            assert subcommand in result.stdout

    def test_benchmark_run_help(self) -> None:
        """`run --help` documents the file and metrics options."""
        result = runner.invoke(app, ["benchmark", "run", "--help"])
        assert result.exit_code == 0
        for option in ("--file", "--metrics"):
            assert option in result.stdout

    def test_benchmark_show_help(self) -> None:
        """`show --help` documents the --last option."""
        result = runner.invoke(app, ["benchmark", "show", "--help"])
        assert result.exit_code == 0
        assert "--last" in result.stdout

    def test_benchmark_check_help(self) -> None:
        """`check --help` documents the tolerance and window options."""
        result = runner.invoke(app, ["benchmark", "check", "--help"])
        assert result.exit_code == 0
        for option in ("--tolerance", "--window"):
            assert option in result.stdout

View File

@@ -0,0 +1,141 @@
"""Tests for CLI output formatters."""
from datetime import UTC, datetime
from veritext.benchmark.models import BenchmarkRun, RegressionReport
from veritext.cli.formatters import (
format_benchmark_history,
format_regression_report,
format_validation_json,
format_validation_simple,
format_validation_table,
)
class TestFormatValidationTable:
    """Tests for format_validation_table function."""

    def test_format_empty_results(self) -> None:
        """Test formatting empty results."""
        table = format_validation_table({})
        assert table.title == "Validation Results"
        assert table.row_count == 0

    def test_format_single_metric(self) -> None:
        """Test formatting a single metric."""
        results = {"bleu4": 0.8523}
        table = format_validation_table(results)
        assert table.row_count == 1
        # Without a threshold only the Metric and Score columns exist.
        assert len(table.columns) == 2

    def test_format_multiple_metrics(self) -> None:
        """Test formatting multiple metrics."""
        results = {"bleu4": 0.85, "rouge_l": 0.92, "jaccard": 0.75}
        table = format_validation_table(results)
        assert table.row_count == 3

    def test_format_with_threshold(self) -> None:
        """Test formatting with threshold for pass/fail."""
        results = {"bleu4": 0.85, "rouge_l": 0.45}
        table = format_validation_table(results, threshold=0.5)
        # Should have 3 columns: Metric, Score, Status
        assert len(table.columns) == 3
        assert table.row_count == 2
class TestFormatValidationJson:
    """Check the JSON rendering of validation results."""

    def test_format_empty_results(self) -> None:
        """An empty mapping renders as an empty JSON object."""
        assert format_validation_json({}) == "{}"

    def test_format_results(self) -> None:
        """Each metric appears as a key/value pair in the output."""
        rendered = format_validation_json({"bleu4": 0.85, "rouge_l": 0.92})
        for fragment in ('"bleu4": 0.85', '"rouge_l": 0.92'):
            assert fragment in rendered
class TestFormatValidationSimple:
    """Check the plain-text rendering of validation results."""

    def test_format_empty_results(self) -> None:
        """An empty mapping renders as an empty string."""
        assert format_validation_simple({}) == ""

    def test_format_results(self) -> None:
        """Each metric appears as a `name: score` fragment."""
        rendered = format_validation_simple({"bleu4": 0.8523, "rouge_l": 0.9234})
        for fragment in ("bleu4: 0.8523", "rouge_l: 0.9234"):
            assert fragment in rendered
class TestFormatBenchmarkHistory:
    """Check the table built from benchmark run history."""

    def test_format_empty_history(self) -> None:
        """An empty history still yields a titled table."""
        assert format_benchmark_history([]).title == "Benchmark History"

    def test_format_single_run(self) -> None:
        """One run produces one row."""
        only_run = BenchmarkRun(
            id="test-id",
            benchmark_name="test",
            timestamp=datetime(2024, 1, 15, 10, 30, tzinfo=UTC),
            veritext_version="0.1.0",
            metrics={"rouge_l": 0.85, "bleu4": 0.72},
            sample_count=100,
        )
        assert format_benchmark_history([only_run]).row_count == 1

    def test_format_multiple_runs(self) -> None:
        """Three runs produce three rows."""
        runs = []
        for i in range(3):
            runs.append(
                BenchmarkRun(
                    id=f"test-id-{i}",
                    benchmark_name="test",
                    timestamp=datetime(2024, 1, i + 1, 10, 30, tzinfo=UTC),
                    veritext_version="0.1.0",
                    metrics={"rouge_l": 0.8 + i * 0.01},
                    sample_count=100,
                )
            )
        assert format_benchmark_history(runs).row_count == 3
class TestFormatRegressionReport:
    """Check the panel built from a regression report."""

    def test_format_no_regression(self) -> None:
        """A clean report yields a green-bordered panel."""
        clean = RegressionReport(
            detected=False,
            baseline={"rouge_l": 0.85},
            current={"rouge_l": 0.86},
            deltas={"rouge_l": 0.01},
            tolerance=0.05,
        )
        panel = format_regression_report(clean)
        assert (panel.title, panel.border_style) == ("Regression Check", "green")

    def test_format_with_regression(self) -> None:
        """A report with a detected regression yields a red-bordered panel."""
        regressed = RegressionReport(
            detected=True,
            baseline={"rouge_l": 0.85, "bleu4": 0.72},
            current={"rouge_l": 0.70, "bleu4": 0.70},
            deltas={"rouge_l": -0.15, "bleu4": -0.02},
            tolerance=0.05,
        )
        panel = format_regression_report(regressed)
        assert (panel.title, panel.border_style) == ("Regression Check", "red")

View File

@@ -0,0 +1,145 @@
"""Tests for CLI input readers."""
import json
from pathlib import Path
import pytest
from veritext.cli.readers import TextPair, read_jsonl, read_paired_jsonl
class TestTextPair:
    """Check construction of the TextPair dataclass."""

    def test_create_text_pair(self) -> None:
        """Both fields hold the values passed to the constructor."""
        pair = TextPair(candidate="hello", reference="world")
        assert (pair.candidate, pair.reference) == ("hello", "world")
class TestReadJsonl:
    """Check read_jsonl against valid, empty, and malformed files."""

    def test_read_valid_jsonl(self, tmp_path: Path) -> None:
        """Two well-formed lines produce two pairs in file order."""
        records = [
            {"candidate": "foo", "reference": "bar"},
            {"candidate": "baz", "reference": "qux"},
        ]
        jsonl_file = tmp_path / "data.jsonl"
        jsonl_file.write_text("\n".join(json.dumps(record) for record in records))

        pairs = read_jsonl(jsonl_file)

        assert len(pairs) == 2
        first, second = pairs
        assert first.candidate == "foo"
        assert first.reference == "bar"
        assert second.candidate == "baz"
        assert second.reference == "qux"

    def test_read_empty_file(self, tmp_path: Path) -> None:
        """An empty file produces an empty list."""
        jsonl_file = tmp_path / "empty.jsonl"
        jsonl_file.write_text("")
        assert read_jsonl(jsonl_file) == []

    def test_read_file_with_blank_lines(self, tmp_path: Path) -> None:
        """Blank lines between records are skipped."""
        jsonl_file = tmp_path / "data.jsonl"
        jsonl_file.write_text(
            '{"candidate": "a", "reference": "b"}\n\n{"candidate": "c", "reference": "d"}\n'
        )
        assert len(read_jsonl(jsonl_file)) == 2

    def test_read_file_not_found(self, tmp_path: Path) -> None:
        """A missing path raises FileNotFoundError."""
        missing = tmp_path / "nonexistent.jsonl"
        with pytest.raises(FileNotFoundError):
            read_jsonl(missing)

    def test_read_invalid_json(self, tmp_path: Path) -> None:
        """A line that is not JSON raises ValueError naming the line."""
        jsonl_file = tmp_path / "invalid.jsonl"
        jsonl_file.write_text("not valid json")
        with pytest.raises(ValueError, match="Invalid JSON on line 1"):
            read_jsonl(jsonl_file)

    def test_read_missing_candidate_key(self, tmp_path: Path) -> None:
        """A record without 'candidate' raises ValueError naming the key."""
        jsonl_file = tmp_path / "data.jsonl"
        jsonl_file.write_text('{"reference": "bar"}')
        with pytest.raises(ValueError, match="Missing 'candidate' key on line 1"):
            read_jsonl(jsonl_file)

    def test_read_missing_reference_key(self, tmp_path: Path) -> None:
        """A record without 'reference' raises ValueError naming the key."""
        jsonl_file = tmp_path / "data.jsonl"
        jsonl_file.write_text('{"candidate": "foo"}')
        with pytest.raises(ValueError, match="Missing 'reference' key on line 1"):
            read_jsonl(jsonl_file)
class TestReadPairedJsonl:
    """Checks read_paired_jsonl, which takes a candidates file and a references file."""

    def test_read_paired_valid(self, tmp_path: Path) -> None:
        """Records at the same position in each file are paired together."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text('{"text": "foo"}\n{"text": "bar"}')
        ref_path.write_text('{"text": "baz"}\n{"text": "qux"}')

        loaded = read_paired_jsonl(cand_path, ref_path)

        assert len(loaded) == 2
        first, second = loaded[0], loaded[1]
        assert first.candidate == "foo"
        assert first.reference == "baz"
        assert second.candidate == "bar"
        assert second.reference == "qux"

    def test_read_paired_length_mismatch(self, tmp_path: Path) -> None:
        """Two candidates against one reference is expected to raise ValueError."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text('{"text": "foo"}\n{"text": "bar"}')
        ref_path.write_text('{"text": "baz"}')
        expected_message = "does not match"

        with pytest.raises(ValueError, match=expected_message):
            read_paired_jsonl(cand_path, ref_path)

    def test_read_paired_candidates_not_found(self, tmp_path: Path) -> None:
        """A missing candidates file is expected to raise FileNotFoundError naming it."""
        ref_path = tmp_path / "references.jsonl"
        ref_path.write_text('{"text": "baz"}')
        missing = tmp_path / "nonexistent.jsonl"
        expected_message = "Candidates file not found"

        with pytest.raises(FileNotFoundError, match=expected_message):
            read_paired_jsonl(missing, ref_path)

    def test_read_paired_references_not_found(self, tmp_path: Path) -> None:
        """A missing references file is expected to raise FileNotFoundError naming it."""
        cand_path = tmp_path / "candidates.jsonl"
        cand_path.write_text('{"text": "foo"}')
        missing = tmp_path / "nonexistent.jsonl"
        expected_message = "References file not found"

        with pytest.raises(FileNotFoundError, match=expected_message):
            read_paired_jsonl(cand_path, missing)

    def test_read_paired_missing_text_key(self, tmp_path: Path) -> None:
        """A candidates record without 'text' is expected to raise ValueError."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        # Only the candidates file is malformed; the references file is valid.
        cand_path.write_text('{"value": "foo"}')
        ref_path.write_text('{"text": "baz"}')
        expected_message = "Missing 'text' key in candidates file"

        with pytest.raises(ValueError, match=expected_message):
            read_paired_jsonl(cand_path, ref_path)

View File

@@ -0,0 +1,233 @@
"""Tests for CLI validate command."""
import json
from pathlib import Path
from typer.testing import CliRunner
from veritext.cli.main import app
# Shared CliRunner instance: every test in this module calls
# runner.invoke(app, [...]) on it and inspects the returned exit code and stdout.
runner = CliRunner()
class TestValidateInline:
    """Checks the validate command when candidate and reference are given as arguments."""

    def test_validate_inline_basic(self) -> None:
        """Identical texts with the bleu metric: exit 0 and 'bleu4' in the output."""
        argv = [
            "validate",
            "The quick brown fox jumps",
            "-r",
            "The quick brown fox jumps",
            "-m",
            "bleu",
        ]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "bleu4" in outcome.stdout

    def test_validate_inline_with_rouge(self) -> None:
        """The rouge metric: exit 0 and 'rouge_l' in the output."""
        argv = [
            "validate",
            "hello world today",
            "-r",
            "hello world here",
            "-m",
            "rouge",
        ]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "rouge_l" in outcome.stdout

    def test_validate_inline_with_lexical(self) -> None:
        """The lexical metric: exit 0 with both 'jaccard' and 'token_overlap' shown."""
        argv = [
            "validate",
            "hello world",
            "-r",
            "hello everyone",
            "-m",
            "lexical",
        ]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "jaccard" in outcome.stdout
        assert "token_overlap" in outcome.stdout

    def test_validate_inline_json_output(self) -> None:
        """With -o json, stdout must parse as JSON and contain a 'bleu4' key."""
        argv = [
            "validate",
            "hello world today",
            "-r",
            "hello world today",
            "-m",
            "bleu",
            "-o",
            "json",
        ]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        payload = json.loads(outcome.stdout)
        assert "bleu4" in payload

    def test_validate_inline_simple_output(self) -> None:
        """With -o simple, the output is expected to contain 'rouge_l:'."""
        argv = [
            "validate",
            "hello world today",
            "-r",
            "hello world today",
            "-m",
            "rouge",
            "-o",
            "simple",
        ]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "rouge_l:" in outcome.stdout

    def test_validate_inline_missing_reference(self) -> None:
        """Omitting the reference is expected to exit 1 with 'Error' in the output."""
        argv = ["validate", "hello world", "-m", "bleu"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_validate_inline_invalid_metric(self) -> None:
        """An unrecognised metric name is expected to exit 1 with 'Unknown metrics'."""
        argv = ["validate", "hello", "-r", "world", "-m", "invalid_metric"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Unknown metrics" in outcome.stdout
class TestValidateFile:
    """Tests for file-based validation mode.

    Every input path is derived from pytest's ``tmp_path`` fixture so the tests
    do not depend on the layout of the host filesystem.
    """

    def test_validate_file_basic(self, tmp_path: Path) -> None:
        """Test basic file-based validation.

        Writes a two-record JSONL file and expects exit code 0, a ``bleu4``
        entry and an "Evaluated 2 text pairs" summary in the output.
        """
        data_file = tmp_path / "data.jsonl"
        data_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}\n'
            '{"candidate": "foo bar baz", "reference": "foo bar baz"}'
        )

        result = runner.invoke(
            app,
            ["validate", "-f", str(data_file), "-m", "bleu"],
        )

        assert result.exit_code == 0
        assert "bleu4" in result.stdout
        assert "Evaluated 2 text pairs" in result.stdout

    def test_validate_file_not_found(self, tmp_path: Path) -> None:
        """Test file-based validation with non-existent file.

        Expects exit code 1 and "Error" in the output.
        """
        # A never-created path under tmp_path is guaranteed to be missing on
        # every platform, unlike a hard-coded absolute POSIX path.
        missing_file = tmp_path / "nonexistent.jsonl"

        result = runner.invoke(
            app,
            ["validate", "-f", str(missing_file), "-m", "bleu"],
        )

        assert result.exit_code == 1
        assert "Error" in result.stdout

    def test_validate_paired_files(self, tmp_path: Path) -> None:
        """Test validation with separate candidate and reference files.

        The candidates file is passed with ``-f`` and the references file with
        ``-R``; expects exit code 0 and an "Evaluated 2 text pairs" summary.
        """
        candidates_file = tmp_path / "candidates.jsonl"
        references_file = tmp_path / "references.jsonl"
        candidates_file.write_text(
            '{"text": "hello world today"}\n{"text": "foo bar baz"}'
        )
        references_file.write_text(
            '{"text": "hello world today"}\n{"text": "foo bar baz"}'
        )

        result = runner.invoke(
            app,
            [
                "validate",
                "-f",
                str(candidates_file),
                "-R",
                str(references_file),
                "-m",
                "bleu",
            ],
        )

        assert result.exit_code == 0
        assert "Evaluated 2 text pairs" in result.stdout
class TestValidateOptions:
    """Checks the validate command's threshold, output-format and metric-list options."""

    def test_validate_with_threshold(self) -> None:
        """With -t 0.5 on identical texts: exit 0 and a status marker in the output."""
        argv = [
            "validate",
            "hello world today",
            "-r",
            "hello world today",
            "-m",
            "bleu",
            "-t",
            "0.5",
        ]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        # Either the Status column header or a PASS cell is accepted.
        assert any(marker in outcome.stdout for marker in ("Status", "PASS"))

    def test_validate_invalid_output_format(self) -> None:
        """An unknown -o value is expected to exit 1 with 'Invalid output format'."""
        argv = [
            "validate",
            "hello",
            "-r",
            "world",
            "-m",
            "bleu",
            "-o",
            "invalid",
        ]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Invalid output format" in outcome.stdout

    def test_validate_multiple_metrics(self) -> None:
        """A comma-separated -m list: exit 0 with an entry from each metric shown."""
        argv = [
            "validate",
            "The quick brown fox",
            "-r",
            "The quick brown fox",
            "-m",
            "bleu,rouge,lexical",
        ]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "bleu4" in outcome.stdout
        assert "rouge_l" in outcome.stdout
        assert "jaccard" in outcome.stdout