Compare commits
8 Commits
feat/bench
...
feat/cli
| Author | SHA1 | Date | |
|---|---|---|---|
|
d5df8b52e6
|
|||
|
8b7c087de7
|
|||
|
c54f8c3f6f
|
|||
|
0cadfd4d23
|
|||
|
e128720917
|
|||
|
f713d5e8a6
|
|||
|
9853b57843
|
|||
|
55faae3e1b
|
@@ -83,6 +83,11 @@ Each layer depends only on layers below it.
|
||||
|
||||
## Git Workflow
|
||||
|
||||
### Before Starting Work
|
||||
|
||||
When starting work from a plan, create a new branch matching the plan's scope before
|
||||
making any changes. Do not reuse an existing branch from previous work, even if related.
|
||||
|
||||
### Commits
|
||||
|
||||
- Format: `type(scope): description`
|
||||
|
||||
@@ -45,3 +45,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
- `assert_no_regression()` raises `RegressionDetectedError` for CI integration
|
||||
- Customisable tolerance threshold and window size for regression detection
|
||||
- Metadata support for tracking git SHA, model versions, etc.
|
||||
- Command-line interface (CLI) via `veritext` command
|
||||
- `veritext validate` command for inline and file-based text validation
|
||||
- JSONL input format support for batch validation (`--file` option)
|
||||
- Separate candidate/reference file support (`--reference-file` option)
|
||||
- Multiple output formats: table (default), JSON, and simple text
|
||||
- `veritext benchmark run` command for running evaluations and storing results
|
||||
- `veritext benchmark show` command for viewing benchmark history
|
||||
- `veritext benchmark check` command for regression detection with exit code 1 on failure
|
||||
- Rich-formatted terminal output with tables and coloured panels
|
||||
|
||||
5
src/veritext/cli/__init__.py
Normal file
5
src/veritext/cli/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""CLI module: Command-line interface for Veritext."""
|
||||
|
||||
from veritext.cli.main import app
|
||||
|
||||
__all__ = ["app"]
|
||||
166
src/veritext/cli/benchmark.py
Normal file
166
src/veritext/cli/benchmark.py
Normal file
@@ -0,0 +1,166 @@
|
||||
"""Benchmark commands for quality tracking."""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Annotated
|
||||
|
||||
import typer
|
||||
|
||||
from veritext.benchmark import Benchmark
|
||||
from veritext.cli.formatters import (
|
||||
console,
|
||||
format_benchmark_history,
|
||||
format_regression_report,
|
||||
)
|
||||
from veritext.cli.readers import read_jsonl
|
||||
|
||||
# Typer sub-application for the `benchmark` command group; invoking it with
# no arguments prints its help.
benchmark_app = typer.Typer(
    no_args_is_help=True,
    help="Track and compare text quality over time.",
    name="benchmark",
)
|
||||
|
||||
|
||||
@benchmark_app.command("run")
def benchmark_run(
    name: Annotated[
        str,
        typer.Argument(help="Name for this benchmark suite."),
    ],
    file: Annotated[
        Path,
        typer.Option("--file", "-f", help="JSONL file with candidate/reference pairs."),
    ],
    metrics: Annotated[
        str,
        typer.Option(
            "--metrics",
            "-m",
            help="Comma-separated metrics to track (e.g., rouge_l,bleu4).",
        ),
    ] = "rouge_l,bleu4",
    storage_path: Annotated[
        Path,
        typer.Option(
            "--storage",
            "-s",
            help="Directory for benchmark data storage.",
        ),
    ] = Path("benchmarks"),
) -> None:
    """
    Run a benchmark evaluation and store the results.

    Example:
        veritext benchmark run my_bench -f data.jsonl -m rouge_l,bleu4
    """
    # NOTE: the docstring above doubles as the command's --help text, so
    # implementation notes live in comments instead.
    #
    # Exit codes: 1 when the input file cannot be read or no metric name is
    # given, 0 when the file holds no pairs (warning only) or the run succeeds.

    # Reader failures become a short error message instead of a traceback.
    try:
        pairs = read_jsonl(file)
    except (FileNotFoundError, ValueError) as e:
        console.print(f"[red]Error:[/red] {e}")
        raise typer.Exit(code=1) from e

    if not pairs:
        console.print("[yellow]Warning:[/yellow] No text pairs found in file.")
        raise typer.Exit(code=0)

    # Drop empty entries (e.g. from a trailing comma in "rouge_l,") so an
    # empty string is never handed to the benchmark as a metric name.
    metric_names = [m.strip() for m in metrics.split(",") if m.strip()]
    if not metric_names:
        console.print("[red]Error:[/red] No metrics specified.")
        raise typer.Exit(code=1)

    candidates = [p.candidate for p in pairs]
    references = [p.reference for p in pairs]

    # Evaluate the pairs under the named suite and report the returned run.
    bench = Benchmark(name, storage_path=storage_path)
    run = bench.evaluate(candidates, references, metrics=metric_names)

    console.print(f"[green]Benchmark '{name}' completed.[/green]")
    console.print(f"Samples: {run.sample_count}")
    console.print("\nMetrics:")
    for metric_name, value in sorted(run.metrics.items()):
        console.print(f" {metric_name}: {value:.4f}")
|
||||
|
||||
|
||||
@benchmark_app.command("show")
def benchmark_show(
    name: Annotated[
        str,
        typer.Argument(help="Name of the benchmark suite."),
    ],
    last: Annotated[
        int,
        typer.Option("--last", "-n", help="Number of recent runs to show."),
    ] = 20,
    storage_path: Annotated[
        Path,
        typer.Option(
            "--storage",
            "-s",
            help="Directory for benchmark data storage.",
        ),
    ] = Path("benchmarks"),
) -> None:
    """
    Show benchmark history for a suite.

    Example:
        veritext benchmark show my_bench --last 10
    """
    # The docstring above is the command's --help text; notes go in comments.
    history = Benchmark(name, storage_path=storage_path).get_history(limit=last)

    if history:
        console.print(format_benchmark_history(history))
        return

    # An empty history is reported as a notice, not an error (exit status 0).
    console.print(f"[yellow]No benchmark runs found for '{name}'.[/yellow]")
    raise typer.Exit(code=0)
|
||||
|
||||
|
||||
@benchmark_app.command("check")
def benchmark_check(
    name: Annotated[
        str,
        typer.Argument(help="Name of the benchmark suite."),
    ],
    tolerance: Annotated[
        float,
        typer.Option(
            "--tolerance",
            "-t",
            help="Maximum allowed metric drop (e.g., 0.05 = 5%).",
        ),
    ] = 0.05,
    window: Annotated[
        int,
        typer.Option(
            "--window",
            "-w",
            help="Number of historical runs for baseline.",
        ),
    ] = 10,
    storage_path: Annotated[
        Path,
        typer.Option(
            "--storage",
            "-s",
            help="Directory for benchmark data storage.",
        ),
    ] = Path("benchmarks"),
) -> None:
    """
    Check for quality regression against historical baseline.

    Exits with code 1 if regression detected (for CI integration).

    Example:
        veritext benchmark check my_bench --tolerance 0.05
    """
    # The docstring above is the command's --help text; notes go in comments.
    suite = Benchmark(name, storage_path=storage_path)
    outcome = suite.check_regression(tolerance=tolerance, window=window)

    # The report panel is always printed, whatever the outcome.
    console.print(format_regression_report(outcome))

    if not outcome.detected:
        return
    # Non-zero exit status so a CI job fails when a regression is reported.
    raise typer.Exit(code=1)
|
||||
170
src/veritext/cli/formatters.py
Normal file
170
src/veritext/cli/formatters.py
Normal file
@@ -0,0 +1,170 @@
|
||||
"""Rich output formatters for CLI display."""
|
||||
|
||||
import json
|
||||
|
||||
from rich.console import Console
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
|
||||
from veritext.benchmark.models import BenchmarkRun, RegressionReport
|
||||
|
||||
console = Console()
|
||||
|
||||
|
||||
def format_validation_table(
    results: dict[str, float],
    threshold: float | None = None,
) -> Table:
    """
    Build a Rich table with one row per metric, sorted by metric name.

    Args:
        results: Mapping of metric name to score.
        threshold: When given, a "Status" column is added showing PASS for
            scores greater than or equal to the threshold and FAIL otherwise.

    Returns:
        The populated Rich Table (titled "Validation Results").
    """
    with_status = threshold is not None

    table = Table(title="Validation Results", show_header=True, header_style="bold")
    table.add_column("Metric", style="cyan")
    table.add_column("Score", justify="right")
    if with_status:
        table.add_column("Status", justify="center")

    for name in sorted(results):
        value = results[name]
        cells = [name, f"{value:.4f}"]
        if with_status:
            cells.append(
                "[green]PASS[/green]" if value >= threshold else "[red]FAIL[/red]"
            )
        table.add_row(*cells)

    return table
|
||||
|
||||
|
||||
def format_validation_json(results: dict[str, float]) -> str:
    """
    Serialise validation results to a JSON document.

    Args:
        results: Mapping of metric name to score.

    Returns:
        The mapping encoded as JSON, pretty-printed with a two-space indent.
    """
    encoded = json.dumps(results, indent=2)
    return encoded
|
||||
|
||||
|
||||
def format_validation_simple(results: dict[str, float]) -> str:
    """
    Render validation results as plain ``name: score`` lines.

    Args:
        results: Mapping of metric name to score.

    Returns:
        One line per metric, sorted by metric name, with scores shown to four
        decimal places. An empty mapping yields an empty string.
    """
    return "\n".join(f"{name}: {results[name]:.4f}" for name in sorted(results))
|
||||
|
||||
|
||||
def format_benchmark_history(runs: list[BenchmarkRun]) -> Table:
    """
    Format benchmark run history as a Rich table.

    The table has a timestamp column, a sample-count column and one column per
    metric name found in any of the runs (sorted alphabetically). Rows are
    added in the order the runs are given.

    Args:
        runs: List of BenchmarkRun objects (most recent first).

    Returns:
        Rich Table object. For an empty list, a placeholder table with a
        single "No runs found" column is returned.
    """
    if not runs:
        table = Table(title="Benchmark History")
        table.add_column("No runs found")
        return table

    # Union of metric names across all runs; runs may track different metrics.
    sorted_metrics = sorted({metric for run in runs for metric in run.metrics})

    table = Table(title="Benchmark History", show_header=True, header_style="bold")
    table.add_column("Timestamp", style="cyan")
    table.add_column("Samples", justify="right")
    for metric in sorted_metrics:
        table.add_column(metric, justify="right")

    for run in runs:
        # A metric this run did not record is shown as "-" rather than
        # 0.0000, which would be indistinguishable from a real zero score.
        metric_values = [
            f"{run.metrics[m]:.4f}" if m in run.metrics else "-"
            for m in sorted_metrics
        ]
        table.add_row(
            run.timestamp.strftime("%Y-%m-%d %H:%M"),
            str(run.sample_count),
            *metric_values,
        )

    return table
|
||||
|
||||
|
||||
def format_regression_report(report: RegressionReport) -> Panel:
    """
    Render a regression report as a Rich panel.

    Args:
        report: RegressionReport whose ``detected``, ``tolerance``,
            ``deltas``, ``baseline`` and ``current`` fields are displayed.

    Returns:
        A green-bordered panel when no regression was detected; otherwise a
        red-bordered panel listing every metric in ``report.deltas`` with its
        current value, baseline, delta and a REGRESSED/OK marker.
    """
    tolerance_line = f"Tolerance: {report.tolerance:.2%}"

    if not report.detected:
        return Panel(
            f"[green]No regression detected.[/green]\n{tolerance_line}",
            title="Regression Check",
            border_style="green",
        )

    body = [
        "[red]Regression detected![/red]",
        tolerance_line,
        "",
        "Metric details:",
    ]

    for metric in sorted(report.deltas):
        delta = report.deltas[metric]
        # Metrics absent from baseline/current are displayed as 0.0.
        current = report.current.get(metric, 0.0)
        baseline = report.baseline.get(metric, 0.0)

        # A metric is marked as regressed when its delta falls below -tolerance.
        status = (
            "[red]REGRESSED[/red]"
            if delta < -report.tolerance
            else "[green]OK[/green]"
        )
        body.append(
            f" {metric}: {current:.4f} (baseline: {baseline:.4f}, "
            f"delta: {delta:+.4f}) {status}"
        )

    return Panel("\n".join(body), title="Regression Check", border_style="red")
|
||||
|
||||
|
||||
def print_validation_output(
    results: dict[str, float],
    output_format: str = "table",
    threshold: float | None = None,
) -> None:
    """
    Render validation results in the requested format and print them.

    Args:
        results: Mapping of metric name to score.
        output_format: 'json' or 'simple'; any other value selects the table.
        threshold: Optional pass/fail threshold, used by the table format only.
    """
    if output_format == "json":
        rendered = format_validation_json(results)
    elif output_format == "simple":
        rendered = format_validation_simple(results)
    else:
        rendered = format_validation_table(results, threshold)

    console.print(rendered)
|
||||
37
src/veritext/cli/main.py
Normal file
37
src/veritext/cli/main.py
Normal file
@@ -0,0 +1,37 @@
|
||||
"""Veritext CLI entry point."""
|
||||
|
||||
import typer
|
||||
|
||||
import veritext
|
||||
from veritext.cli.benchmark import benchmark_app
|
||||
from veritext.cli.validate import validate
|
||||
|
||||
# Root Typer application; invoking it with no arguments prints its help.
app = typer.Typer(
    no_args_is_help=True,
    help="Semantic text validation framework.",
    name="veritext",
)

# Attach the `validate` command first, then the `benchmark` command group.
app.command()(validate)
app.add_typer(benchmark_app)
|
||||
|
||||
|
||||
@app.callback(invoke_without_command=True)
def main(
    version: bool | None = typer.Option(
        None,
        "--version",
        "-V",
        help="Show version and exit.",
        is_eager=True,
    ),
) -> None:
    """Veritext: Semantic text validation framework for Python."""
    # The docstring above is shown as the application's help text.
    if not version:
        return

    # --version / -V: print the package version and stop.
    typer.echo(f"veritext {veritext.__version__}")
    raise typer.Exit()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app()
|
||||
120
src/veritext/cli/readers.py
Normal file
120
src/veritext/cli/readers.py
Normal file
@@ -0,0 +1,120 @@
|
||||
"""Input readers for CLI operations."""
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@dataclass
class TextPair:
    """One candidate text together with the reference it is compared against."""

    # Text being evaluated.
    candidate: str
    # Text the candidate is compared with.
    reference: str
|
||||
|
||||
|
||||
def read_jsonl(path: Path) -> list[TextPair]:
    """
    Read text pairs from a JSONL file.

    Each non-blank line must be a JSON object with 'candidate' and 'reference'
    keys. Blank lines are skipped; values are converted with ``str()``.

    Args:
        path: Path to the JSONL file (read as UTF-8).

    Returns:
        List of TextPair objects, in file order.

    Raises:
        FileNotFoundError: If the path does not exist or is not a regular file.
        ValueError: If a line is not valid JSON, is not a JSON object, or is
            missing a required key. Undecodable bytes also surface as a
            ValueError (UnicodeDecodeError is a subclass).
    """
    # is_file() also rejects directories, which would otherwise fail inside
    # open() with an OSError that callers do not handle.
    if not path.is_file():
        raise FileNotFoundError(f"File not found: {path}")

    pairs: list[TextPair] = []
    # Read as UTF-8 explicitly instead of relying on the platform's locale.
    with path.open(encoding="utf-8") as f:
        for line_num, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON on line {line_num}: {e}") from e

            # A bare string/number/list is valid JSON but not a record; without
            # this check the key tests below would raise TypeError or perform
            # a substring/element match.
            if not isinstance(data, dict):
                raise ValueError(f"Expected a JSON object on line {line_num}")

            for key in ("candidate", "reference"):
                if key not in data:
                    raise ValueError(f"Missing '{key}' key on line {line_num}")

            pairs.append(
                TextPair(
                    candidate=str(data["candidate"]),
                    reference=str(data["reference"]),
                )
            )

    return pairs
|
||||
|
||||
|
||||
def read_paired_jsonl(candidates_path: Path, references_path: Path) -> list[TextPair]:
    """
    Build text pairs from two parallel JSONL files.

    Both files are read via ``_read_text_jsonl`` (one JSON object with a
    'text' key per line) and paired up by position.

    Args:
        candidates_path: Path to the candidates JSONL file.
        references_path: Path to the references JSONL file.

    Returns:
        List of TextPair objects, one per line position.

    Raises:
        FileNotFoundError: If either file does not exist.
        ValueError: If the files hold different numbers of texts or are
            malformed.
    """
    candidate_texts = _read_text_jsonl(candidates_path, "candidates")
    reference_texts = _read_text_jsonl(references_path, "references")

    n_candidates, n_references = len(candidate_texts), len(reference_texts)
    if n_candidates != n_references:
        raise ValueError(
            f"Number of candidates ({n_candidates}) does not match "
            f"number of references ({n_references})"
        )

    paired: list[TextPair] = []
    for cand, ref in zip(candidate_texts, reference_texts, strict=True):
        paired.append(TextPair(candidate=cand, reference=ref))
    return paired
|
||||
|
||||
|
||||
def _read_text_jsonl(path: Path, label: str) -> list[str]:
|
||||
"""Read text values from a JSONL file with 'text' key per line."""
|
||||
if not path.exists():
|
||||
raise FileNotFoundError(f"{label.capitalize()} file not found: {path}")
|
||||
|
||||
texts: list[str] = []
|
||||
with path.open() as f:
|
||||
for line_num, line in enumerate(f, start=1):
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
try:
|
||||
data = json.loads(line)
|
||||
except json.JSONDecodeError as e:
|
||||
raise ValueError(
|
||||
f"Invalid JSON in {label} file on line {line_num}: {e}"
|
||||
) from e
|
||||
|
||||
if "text" not in data:
|
||||
raise ValueError(
|
||||
f"Missing 'text' key in {label} file on line {line_num}"
|
||||
)
|
||||
|
||||
texts.append(str(data["text"]))
|
||||
|
||||
return texts
|
||||
213
src/veritext/cli/validate.py
Normal file
213
src/veritext/cli/validate.py
Normal file
@@ -0,0 +1,213 @@
|
||||
"""Validate command for computing text metrics."""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Annotated
|
||||
|
||||
import typer
|
||||
|
||||
from veritext.cli.formatters import console, print_validation_output
|
||||
from veritext.cli.readers import read_jsonl, read_paired_jsonl
|
||||
from veritext.metrics.bleu import Bleu
|
||||
from veritext.metrics.lexical import Lexical
|
||||
from veritext.metrics.rouge import Rouge
|
||||
|
||||
# Metric names accepted on the command line (a set of names, not a mapping).
AVAILABLE_METRICS = frozenset(
    ("bleu", "bleu1", "bleu2", "bleu3", "bleu4", "rouge", "rouge_l", "lexical")
)
|
||||
|
||||
|
||||
def _compute_metrics(
    candidate: str,
    reference: str,
    metric_names: list[str],
) -> dict[str, float]:
    """
    Compute requested metrics for a single text pair.

    Each metric family (BLEU, ROUGE, lexical) is scored at most once, and only
    if one of its names was requested; further names from the same family
    reuse that result.

    Args:
        candidate: Text being evaluated.
        reference: Text to compare against.
        metric_names: Requested metric names. Unrecognised names are ignored.

    Returns:
        Mapping of result key to score. "bleu" is reported as "bleu4",
        "rouge" as "rouge_l", and "lexical" contributes both "jaccard" and
        "token_overlap".
    """
    # Requested name -> attribute on the BLEU result (also used as result key).
    bleu_keys = {
        "bleu": "bleu4",
        "bleu1": "bleu1",
        "bleu2": "bleu2",
        "bleu3": "bleu3",
        "bleu4": "bleu4",
    }

    results: dict[str, float] = {}
    # Per-family results, filled in lazily on first use.
    bleu_result = None
    rouge_result = None
    lexical_result = None

    for metric in metric_names:
        if metric in bleu_keys:
            if bleu_result is None:
                bleu_result = Bleu().score(candidate, reference)
            key = bleu_keys[metric]
            results[key] = getattr(bleu_result, key)
        elif metric in ("rouge", "rouge_l"):
            if rouge_result is None:
                rouge_result = Rouge().score(candidate, reference)
            results["rouge_l"] = rouge_result.rouge_l.fmeasure
        elif metric == "lexical":
            if lexical_result is None:
                lexical_result = Lexical().score(candidate, reference)
            results["jaccard"] = lexical_result.jaccard
            results["token_overlap"] = lexical_result.token_overlap

    return results
|
||||
|
||||
|
||||
def _compute_batch_metrics(
    candidates: list[str],
    references: list[str],
    metric_names: list[str],
) -> dict[str, float]:
    """
    Compute average metrics for a batch of text pairs.

    Each metric family (BLEU, ROUGE, lexical) is batch-scored at most once,
    and only if one of its names was requested; further names from the same
    family reuse those statistics.

    Args:
        candidates: Texts being evaluated.
        references: Texts to compare against, passed alongside ``candidates``
            to each scorer's ``batch_score``.
        metric_names: Requested metric names. Unrecognised names are ignored.

    Returns:
        Mapping of result key to the mean score. "bleu" is reported as
        "bleu4", "rouge" as "rouge_l", and "lexical" contributes "jaccard" and
        "token_overlap". A key is omitted when the batch result has no
        statistics entry for it.
    """
    # Requested name -> key in the BLEU batch statistics (also the result key).
    bleu_keys = {
        "bleu": "bleu4",
        "bleu1": "bleu1",
        "bleu2": "bleu2",
        "bleu3": "bleu3",
        "bleu4": "bleu4",
    }

    results: dict[str, float] = {}
    # Per-family statistics, filled in lazily on first use.
    bleu_stats = None
    rouge_stats = None
    lexical_stats = None

    for metric in metric_names:
        if metric in bleu_keys:
            if bleu_stats is None:
                bleu_stats = Bleu().batch_score(candidates, references).stats
            key = bleu_keys[metric]
            stats = bleu_stats.get(key)
            if stats:
                results[key] = stats.mean
        elif metric in ("rouge", "rouge_l"):
            if rouge_stats is None:
                rouge_stats = Rouge().batch_score(candidates, references).stats
            stats = rouge_stats.get("rouge_l_fmeasure")
            if stats:
                results["rouge_l"] = stats.mean
        elif metric == "lexical":
            if lexical_stats is None:
                lexical_stats = Lexical().batch_score(candidates, references).stats
            jaccard_stats = lexical_stats.get("jaccard")
            overlap_stats = lexical_stats.get("token_overlap")
            if jaccard_stats:
                results["jaccard"] = jaccard_stats.mean
            if overlap_stats:
                results["token_overlap"] = overlap_stats.mean

    return results
|
||||
|
||||
|
||||
def _parse_metrics(metrics_str: str) -> list[str]:
    """
    Parse comma-separated metric names.

    Names are stripped and lower-cased; empty entries (for example from a
    trailing comma in "bleu,") are ignored.

    Args:
        metrics_str: Raw value of the metrics option.

    Returns:
        The normalised metric names, in the order given.

    Raises:
        typer.BadParameter: If no metric name is left after parsing, or if
            any name is not in AVAILABLE_METRICS.
    """
    normalised = (part.strip().lower() for part in metrics_str.split(","))
    metrics = [name for name in normalised if name]

    available = ", ".join(sorted(AVAILABLE_METRICS))

    if not metrics:
        raise typer.BadParameter(f"No metrics specified. Available: {available}")

    # Validate metric names
    invalid = [m for m in metrics if m not in AVAILABLE_METRICS]
    if invalid:
        raise typer.BadParameter(
            f"Unknown metrics: {', '.join(invalid)}. "
            f"Available: {available}"
        )

    return metrics
|
||||
|
||||
|
||||
def validate(
    text: Annotated[
        str | None,
        typer.Argument(help="Candidate text to validate (inline mode)."),
    ] = None,
    reference: Annotated[
        str | None,
        typer.Option("--reference", "-r", help="Reference text for comparison."),
    ] = None,
    file: Annotated[
        Path | None,
        typer.Option("--file", "-f", help="JSONL file with candidate/reference pairs."),
    ] = None,
    reference_file: Annotated[
        Path | None,
        typer.Option(
            "--reference-file",
            "-R",
            help="Separate JSONL file with references (requires --file).",
        ),
    ] = None,
    metrics: Annotated[
        str,
        typer.Option(
            "--metrics",
            "-m",
            help="Comma-separated metrics: bleu, bleu1-4, rouge, rouge_l, lexical.",
        ),
    ] = "bleu,rouge",
    output: Annotated[
        str,
        typer.Option("--output", "-o", help="Output format: table, json, or simple."),
    ] = "table",
    threshold: Annotated[
        float | None,
        typer.Option("--threshold", "-t", help="Score threshold for pass/fail status."),
    ] = None,
) -> None:
    """
    Validate text quality using various metrics.

    Use inline mode for single texts:
        veritext validate "text" -r "reference" -m bleu,rouge

    Use file mode for batches:
        veritext validate -f outputs.jsonl -m bleu,rouge
    """
    # NOTE: the docstring above doubles as the command's --help text, so
    # implementation notes live in comments instead.
    #
    # Exit codes: 1 for invalid metrics, output format, option combination or
    # unreadable input; 0 when the input file holds no pairs or on success.

    # Parse and validate metric names
    try:
        metric_names = _parse_metrics(metrics)
    except typer.BadParameter as e:
        console.print(f"[red]Error:[/red] {e}")
        raise typer.Exit(code=1) from e

    # Validate output format
    if output not in ("table", "json", "simple"):
        console.print(f"[red]Error:[/red] Invalid output format: {output}")
        raise typer.Exit(code=1)

    # --reference-file is only meaningful together with --file; reject the
    # combination explicitly instead of silently ignoring the option.
    if reference_file is not None and file is None:
        console.print("[red]Error:[/red] --reference-file requires --file.")
        raise typer.Exit(code=1)

    # Determine mode: inline vs file (file mode wins when both are given)
    if file is not None:
        # File mode
        try:
            if reference_file is not None:
                pairs = read_paired_jsonl(file, reference_file)
            else:
                pairs = read_jsonl(file)
        except (FileNotFoundError, ValueError) as e:
            console.print(f"[red]Error:[/red] {e}")
            raise typer.Exit(code=1) from e

        if not pairs:
            console.print("[yellow]Warning:[/yellow] No text pairs found in file.")
            raise typer.Exit(code=0)

        candidates = [p.candidate for p in pairs]
        references = [p.reference for p in pairs]

        results = _compute_batch_metrics(candidates, references, metric_names)

        # Keep JSON output parseable: the human-readable summary line is
        # skipped when the JSON format is requested.
        if output != "json":
            console.print(f"[dim]Evaluated {len(pairs)} text pairs.[/dim]\n")

    elif text is not None and reference is not None:
        # Inline mode
        results = _compute_metrics(text, reference, metric_names)

    else:
        # Invalid usage
        console.print(
            "[red]Error:[/red] Provide either text and --reference, "
            "or --file for batch mode."
        )
        raise typer.Exit(code=1)

    print_validation_output(results, output, threshold)
|
||||
1
tests/test_cli/__init__.py
Normal file
1
tests/test_cli/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""CLI test suite."""
|
||||
337
tests/test_cli/test_benchmark.py
Normal file
337
tests/test_cli/test_benchmark.py
Normal file
@@ -0,0 +1,337 @@
|
||||
"""Tests for CLI benchmark commands."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from typer.testing import CliRunner
|
||||
|
||||
from veritext.cli.main import app
|
||||
|
||||
runner = CliRunner()
|
||||
|
||||
|
||||
class TestBenchmarkRun:
    """Tests for benchmark run command."""

    @staticmethod
    def _invoke_run(*options: str):
        """Invoke ``benchmark run test_bench`` with the given extra options."""
        return runner.invoke(app, ["benchmark", "run", "test_bench", *options])

    def test_benchmark_run_basic(self, tmp_path: Path) -> None:
        """Test basic benchmark run."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}\n'
            '{"candidate": "foo bar baz qux", "reference": "foo bar baz qux"}'
        )
        storage_path = tmp_path / "benchmarks"

        result = self._invoke_run(
            "-f",
            str(data_file),
            "-m",
            "rouge_l,bleu4",
            "-s",
            str(storage_path),
        )

        assert result.exit_code == 0
        for expected in (
            "Benchmark 'test_bench' completed",
            "Samples: 2",
            "rouge_l:",
            "bleu4:",
        ):
            assert expected in result.stdout

    def test_benchmark_run_file_not_found(self, tmp_path: Path) -> None:
        """Test benchmark run with non-existent file."""
        storage_path = tmp_path / "benchmarks"

        result = self._invoke_run(
            "-f",
            "/nonexistent/file.jsonl",
            "-s",
            str(storage_path),
        )

        assert result.exit_code == 1
        assert "Error" in result.stdout

    def test_benchmark_run_creates_storage(self, tmp_path: Path) -> None:
        """Test that benchmark run creates storage directory."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
        storage_path = tmp_path / "new_benchmarks"

        result = self._invoke_run("-f", str(data_file), "-s", str(storage_path))

        assert result.exit_code == 0
        assert storage_path.exists()
|
||||
|
||||
|
||||
class TestBenchmarkShow:
    """Tests for benchmark show command."""

    @staticmethod
    def _record_run(data_file: Path, storage_path: Path) -> None:
        """Store one ``test_bench`` run, failing fast if the setup command fails."""
        result = runner.invoke(
            app,
            [
                "benchmark",
                "run",
                "test_bench",
                "-f",
                str(data_file),
                "-s",
                str(storage_path),
            ],
        )
        # Without this check a broken setup would only show up later as a
        # misleading failure of the command actually under test.
        assert result.exit_code == 0, result.output

    def test_benchmark_show_no_runs(self, tmp_path: Path) -> None:
        """Test showing benchmark with no runs."""
        storage_path = tmp_path / "benchmarks"
        storage_path.mkdir()

        result = runner.invoke(
            app,
            [
                "benchmark",
                "show",
                "nonexistent_bench",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "No benchmark runs found" in result.stdout

    def test_benchmark_show_with_runs(self, tmp_path: Path) -> None:
        """Test showing benchmark history with runs."""
        # First create some runs
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello world", "reference": "hello world"}')
        storage_path = tmp_path / "benchmarks"

        # Run benchmark twice
        for _ in range(2):
            self._record_run(data_file, storage_path)

        # Show history
        result = runner.invoke(
            app,
            [
                "benchmark",
                "show",
                "test_bench",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "Benchmark History" in result.stdout

    def test_benchmark_show_limit(self, tmp_path: Path) -> None:
        """Test showing limited benchmark history."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
        storage_path = tmp_path / "benchmarks"

        # Run benchmark 3 times
        for _ in range(3):
            self._record_run(data_file, storage_path)

        # Show only last 2
        result = runner.invoke(
            app,
            [
                "benchmark",
                "show",
                "test_bench",
                "--last",
                "2",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        # The history table must be rendered, not the "no runs" notice.
        # NOTE(review): the number of displayed rows is still not asserted.
        assert "Benchmark History" in result.stdout
|
||||
|
||||
|
||||
class TestBenchmarkCheck:
    """Tests for benchmark check command."""

    @staticmethod
    def _record_run(data_file: Path, storage_path: Path) -> None:
        """Store one ``test_bench`` run, failing fast if the setup command fails."""
        result = runner.invoke(
            app,
            [
                "benchmark",
                "run",
                "test_bench",
                "-f",
                str(data_file),
                "-s",
                str(storage_path),
            ],
        )
        # Without this check a broken setup would only show up later as a
        # misleading failure of the command actually under test.
        assert result.exit_code == 0, result.output

    def test_benchmark_check_no_regression(self, tmp_path: Path) -> None:
        """Test checking for regression with no regression."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}'
        )
        storage_path = tmp_path / "benchmarks"

        # Run benchmark twice with same data (no regression)
        for _ in range(2):
            self._record_run(data_file, storage_path)

        # Check for regression
        result = runner.invoke(
            app,
            [
                "benchmark",
                "check",
                "test_bench",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "No regression detected" in result.stdout

    def test_benchmark_check_with_regression(self, tmp_path: Path) -> None:
        """Test checking for regression when regression occurs."""
        storage_path = tmp_path / "benchmarks"

        # First run with good data
        good_file = tmp_path / "good.jsonl"
        good_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}'
        )
        self._record_run(good_file, storage_path)

        # Second run with bad data (regression)
        bad_file = tmp_path / "bad.jsonl"
        bad_file.write_text(
            '{"candidate": "completely different", "reference": "hello world today"}'
        )
        self._record_run(bad_file, storage_path)

        # Check for regression
        result = runner.invoke(
            app,
            [
                "benchmark",
                "check",
                "test_bench",
                "-t",
                "0.05",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 1
        assert "Regression detected" in result.stdout

    def test_benchmark_check_custom_tolerance(self, tmp_path: Path) -> None:
        """Test checking regression with custom tolerance."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
        storage_path = tmp_path / "benchmarks"

        self._record_run(data_file, storage_path)

        result = runner.invoke(
            app,
            [
                "benchmark",
                "check",
                "test_bench",
                "--tolerance",
                "0.10",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        # The tolerance is echoed back formatted as a percentage.
        assert "10.00%" in result.stdout
|
||||
|
||||
|
||||
class TestBenchmarkHelp:
    """Smoke tests for the ``--help`` output of the benchmark commands."""

    @staticmethod
    def _help_text(*command: str) -> str:
        """Run ``command --help``, require exit code 0 and return stdout."""
        outcome = runner.invoke(app, [*command, "--help"])
        assert outcome.exit_code == 0
        return outcome.stdout

    def test_benchmark_help(self) -> None:
        """Group help mentions the run, show and check subcommands."""
        help_text = self._help_text("benchmark")
        for subcommand in ("run", "show", "check"):
            assert subcommand in help_text

    def test_benchmark_run_help(self) -> None:
        """``benchmark run`` help mentions the file and metrics options."""
        help_text = self._help_text("benchmark", "run")
        for option in ("--file", "--metrics"):
            assert option in help_text

    def test_benchmark_show_help(self) -> None:
        """``benchmark show`` help mentions the ``--last`` option."""
        help_text = self._help_text("benchmark", "show")
        assert "--last" in help_text

    def test_benchmark_check_help(self) -> None:
        """``benchmark check`` help mentions tolerance and window options."""
        help_text = self._help_text("benchmark", "check")
        for option in ("--tolerance", "--window"):
            assert option in help_text
|
||||
141
tests/test_cli/test_formatters.py
Normal file
141
tests/test_cli/test_formatters.py
Normal file
@@ -0,0 +1,141 @@
|
||||
"""Tests for CLI output formatters."""
|
||||
|
||||
from datetime import UTC, datetime
|
||||
|
||||
from veritext.benchmark.models import BenchmarkRun, RegressionReport
|
||||
from veritext.cli.formatters import (
|
||||
format_benchmark_history,
|
||||
format_regression_report,
|
||||
format_validation_json,
|
||||
format_validation_simple,
|
||||
format_validation_table,
|
||||
)
|
||||
|
||||
|
||||
class TestFormatValidationTable:
    """Tests for the table returned by ``format_validation_table``."""

    def test_format_empty_results(self) -> None:
        """An empty result mapping yields a titled table without rows."""
        table = format_validation_table({})
        assert table.title == "Validation Results"
        assert table.row_count == 0

    def test_format_single_metric(self) -> None:
        """A single metric produces exactly one row."""
        results = {"bleu4": 0.8523}
        table = format_validation_table(results)
        assert table.row_count == 1

    def test_format_multiple_metrics(self) -> None:
        """Each metric in the mapping produces its own row."""
        results = {"bleu4": 0.85, "rouge_l": 0.92, "jaccard": 0.75}
        table = format_validation_table(results)
        assert table.row_count == 3

    def test_format_with_threshold(self) -> None:
        """Passing a threshold keeps one row per metric and adds a column."""
        results = {"bleu4": 0.85, "rouge_l": 0.45}
        table = format_validation_table(results, threshold=0.5)
        # Should have 3 columns: Metric, Score, Status
        # NOTE(review): the column count was previously only stated in the
        # comment above; confirm it matches the formatter implementation.
        assert len(table.columns) == 3
        assert table.row_count == 2
|
||||
|
||||
|
||||
class TestFormatValidationJson:
    """Checks for the JSON text produced by ``format_validation_json``."""

    def test_format_empty_results(self) -> None:
        """An empty mapping is rendered as an empty JSON object."""
        assert format_validation_json({}) == "{}"

    def test_format_results(self) -> None:
        """Every metric appears in the output as a ``"name": value`` pair."""
        rendered = format_validation_json({"bleu4": 0.85, "rouge_l": 0.92})
        for fragment in ('"bleu4": 0.85', '"rouge_l": 0.92'):
            assert fragment in rendered
|
||||
|
||||
|
||||
class TestFormatValidationSimple:
    """Checks for the plain-text output of ``format_validation_simple``."""

    def test_format_empty_results(self) -> None:
        """An empty mapping is rendered as an empty string."""
        assert format_validation_simple({}) == ""

    def test_format_results(self) -> None:
        """Every metric appears in the output as ``name: value``."""
        rendered = format_validation_simple({"bleu4": 0.8523, "rouge_l": 0.9234})
        for fragment in ("bleu4: 0.8523", "rouge_l: 0.9234"):
            assert fragment in rendered
|
||||
|
||||
|
||||
class TestFormatBenchmarkHistory:
    """Checks for the table produced by ``format_benchmark_history``."""

    @staticmethod
    def _make_run(run_id: str, day: int, metrics: dict[str, float]) -> BenchmarkRun:
        """Build a run for benchmark ``test`` dated January ``day``, 2024."""
        return BenchmarkRun(
            id=run_id,
            benchmark_name="test",
            timestamp=datetime(2024, 1, day, 10, 30, tzinfo=UTC),
            veritext_version="0.1.0",
            metrics=metrics,
            sample_count=100,
        )

    def test_format_empty_history(self) -> None:
        """An empty history still yields the titled table."""
        history_table = format_benchmark_history([])
        assert history_table.title == "Benchmark History"

    def test_format_single_run(self) -> None:
        """One run is rendered as one row."""
        only_run = self._make_run("test-id", 15, {"rouge_l": 0.85, "bleu4": 0.72})
        history_table = format_benchmark_history([only_run])
        assert history_table.row_count == 1

    def test_format_multiple_runs(self) -> None:
        """Three runs are rendered as three rows."""
        history = []
        for i in range(3):
            history.append(
                self._make_run(f"test-id-{i}", i + 1, {"rouge_l": 0.8 + i * 0.01})
            )
        history_table = format_benchmark_history(history)
        assert history_table.row_count == 3
|
||||
|
||||
|
||||
class TestFormatRegressionReport:
    """Checks for the panel produced by ``format_regression_report``."""

    def test_format_no_regression(self) -> None:
        """A report without regression gets a green-bordered panel."""
        clean_report = RegressionReport(
            detected=False,
            baseline={"rouge_l": 0.85},
            current={"rouge_l": 0.86},
            deltas={"rouge_l": 0.01},
            tolerance=0.05,
        )
        rendered = format_regression_report(clean_report)
        assert rendered.title == "Regression Check"
        assert rendered.border_style == "green"

    def test_format_with_regression(self) -> None:
        """A report with a detected regression gets a red-bordered panel."""
        baseline_scores = {"rouge_l": 0.85, "bleu4": 0.72}
        current_scores = {"rouge_l": 0.70, "bleu4": 0.70}
        score_deltas = {"rouge_l": -0.15, "bleu4": -0.02}
        failing_report = RegressionReport(
            detected=True,
            baseline=baseline_scores,
            current=current_scores,
            deltas=score_deltas,
            tolerance=0.05,
        )
        rendered = format_regression_report(failing_report)
        assert rendered.title == "Regression Check"
        assert rendered.border_style == "red"
|
||||
145
tests/test_cli/test_readers.py
Normal file
145
tests/test_cli/test_readers.py
Normal file
@@ -0,0 +1,145 @@
|
||||
"""Tests for CLI input readers."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from veritext.cli.readers import TextPair, read_jsonl, read_paired_jsonl
|
||||
|
||||
|
||||
class TestTextPair:
    """Checks for the ``TextPair`` container."""

    def test_create_text_pair(self) -> None:
        """Constructor keyword arguments are exposed as attributes."""
        built = TextPair(candidate="hello", reference="world")
        assert built.candidate == "hello"
        assert built.reference == "world"
|
||||
|
||||
|
||||
class TestReadJsonl:
    """Checks for ``read_jsonl`` on well-formed and malformed input files."""

    @staticmethod
    def _write(directory: Path, filename: str, content: str) -> Path:
        """Create ``filename`` under ``directory`` with ``content``."""
        target = directory / filename
        target.write_text(content)
        return target

    def test_read_valid_jsonl(self, tmp_path: Path) -> None:
        """Two well-formed records are returned as two pairs, in file order."""
        records = [
            {"candidate": "foo", "reference": "bar"},
            {"candidate": "baz", "reference": "qux"},
        ]
        lines = [json.dumps(record) for record in records]
        source = self._write(tmp_path, "data.jsonl", "\n".join(lines))

        loaded = read_jsonl(source)

        assert len(loaded) == 2
        first, second = loaded
        assert first.candidate == "foo"
        assert first.reference == "bar"
        assert second.candidate == "baz"
        assert second.reference == "qux"

    def test_read_empty_file(self, tmp_path: Path) -> None:
        """An empty file yields an empty list."""
        source = self._write(tmp_path, "empty.jsonl", "")
        assert read_jsonl(source) == []

    def test_read_file_with_blank_lines(self, tmp_path: Path) -> None:
        """Blank lines between records do not produce extra pairs."""
        text = (
            '{"candidate": "a", "reference": "b"}\n'
            "\n"
            '{"candidate": "c", "reference": "d"}\n'
        )
        source = self._write(tmp_path, "data.jsonl", text)
        assert len(read_jsonl(source)) == 2

    def test_read_file_not_found(self, tmp_path: Path) -> None:
        """A missing path raises ``FileNotFoundError``."""
        missing = tmp_path / "nonexistent.jsonl"
        with pytest.raises(FileNotFoundError):
            read_jsonl(missing)

    def test_read_invalid_json(self, tmp_path: Path) -> None:
        """Unparseable content raises ``ValueError`` naming line 1."""
        source = self._write(tmp_path, "invalid.jsonl", "not valid json")
        with pytest.raises(ValueError, match="Invalid JSON on line 1"):
            read_jsonl(source)

    def test_read_missing_candidate_key(self, tmp_path: Path) -> None:
        """A record without ``candidate`` raises ``ValueError`` naming line 1."""
        source = self._write(tmp_path, "data.jsonl", '{"reference": "bar"}')
        with pytest.raises(ValueError, match="Missing 'candidate' key on line 1"):
            read_jsonl(source)

    def test_read_missing_reference_key(self, tmp_path: Path) -> None:
        """A record without ``reference`` raises ``ValueError`` naming line 1."""
        source = self._write(tmp_path, "data.jsonl", '{"candidate": "foo"}')
        with pytest.raises(ValueError, match="Missing 'reference' key on line 1"):
            read_jsonl(source)
|
||||
|
||||
|
||||
class TestReadPairedJsonl:
    """Checks for ``read_paired_jsonl`` with separate input files."""

    @staticmethod
    def _paths(directory: Path) -> tuple[Path, Path]:
        """Return the candidates and references paths under ``directory``."""
        return directory / "candidates.jsonl", directory / "references.jsonl"

    def test_read_paired_valid(self, tmp_path: Path) -> None:
        """Records are paired by position across the two files."""
        cand_path, ref_path = self._paths(tmp_path)
        cand_path.write_text('{"text": "foo"}\n{"text": "bar"}')
        ref_path.write_text('{"text": "baz"}\n{"text": "qux"}')

        loaded = read_paired_jsonl(cand_path, ref_path)

        assert len(loaded) == 2
        first, second = loaded
        assert first.candidate == "foo"
        assert first.reference == "baz"
        assert second.candidate == "bar"
        assert second.reference == "qux"

    def test_read_paired_length_mismatch(self, tmp_path: Path) -> None:
        """Files with different record counts raise ``ValueError``."""
        cand_path, ref_path = self._paths(tmp_path)
        cand_path.write_text('{"text": "foo"}\n{"text": "bar"}')
        ref_path.write_text('{"text": "baz"}')

        with pytest.raises(ValueError, match="does not match"):
            read_paired_jsonl(cand_path, ref_path)

    def test_read_paired_candidates_not_found(self, tmp_path: Path) -> None:
        """A missing candidates file raises ``FileNotFoundError``."""
        _, ref_path = self._paths(tmp_path)
        ref_path.write_text('{"text": "baz"}')
        missing = tmp_path / "nonexistent.jsonl"

        with pytest.raises(FileNotFoundError, match="Candidates file not found"):
            read_paired_jsonl(missing, ref_path)

    def test_read_paired_references_not_found(self, tmp_path: Path) -> None:
        """A missing references file raises ``FileNotFoundError``."""
        cand_path, _ = self._paths(tmp_path)
        cand_path.write_text('{"text": "foo"}')
        missing = tmp_path / "nonexistent.jsonl"

        with pytest.raises(FileNotFoundError, match="References file not found"):
            read_paired_jsonl(cand_path, missing)

    def test_read_paired_missing_text_key(self, tmp_path: Path) -> None:
        """A candidates record without ``text`` raises ``ValueError``."""
        cand_path, ref_path = self._paths(tmp_path)
        cand_path.write_text('{"value": "foo"}')
        ref_path.write_text('{"text": "baz"}')

        with pytest.raises(ValueError, match="Missing 'text' key in candidates file"):
            read_paired_jsonl(cand_path, ref_path)
|
||||
233
tests/test_cli/test_validate.py
Normal file
233
tests/test_cli/test_validate.py
Normal file
@@ -0,0 +1,233 @@
|
||||
"""Tests for CLI validate command."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from typer.testing import CliRunner
|
||||
|
||||
from veritext.cli.main import app
|
||||
|
||||
# Module-level Typer test runner; the tests below call ``runner.invoke`` on it.
runner = CliRunner()
|
||||
|
||||
|
||||
class TestValidateInline:
    """Checks for ``validate`` when the candidate text is given inline."""

    @staticmethod
    def _validate(*cli_args: str):
        """Invoke the ``validate`` command with ``cli_args`` and return the result."""
        return runner.invoke(app, ["validate", *cli_args])

    def test_validate_inline_basic(self) -> None:
        """The bleu metric on identical texts exits 0 and reports ``bleu4``."""
        text = "The quick brown fox jumps"
        outcome = self._validate(text, "-r", text, "-m", "bleu")
        assert outcome.exit_code == 0
        assert "bleu4" in outcome.stdout

    def test_validate_inline_with_rouge(self) -> None:
        """The rouge metric exits 0 and reports ``rouge_l``."""
        outcome = self._validate(
            "hello world today", "-r", "hello world here", "-m", "rouge"
        )
        assert outcome.exit_code == 0
        assert "rouge_l" in outcome.stdout

    def test_validate_inline_with_lexical(self) -> None:
        """The lexical metric exits 0 and reports jaccard and token_overlap."""
        outcome = self._validate("hello world", "-r", "hello everyone", "-m", "lexical")
        assert outcome.exit_code == 0
        for metric_name in ("jaccard", "token_overlap"):
            assert metric_name in outcome.stdout

    def test_validate_inline_json_output(self) -> None:
        """With ``-o json`` stdout parses as JSON containing ``bleu4``."""
        text = "hello world today"
        outcome = self._validate(text, "-r", text, "-m", "bleu", "-o", "json")
        assert outcome.exit_code == 0
        parsed = json.loads(outcome.stdout)
        assert "bleu4" in parsed

    def test_validate_inline_simple_output(self) -> None:
        """With ``-o simple`` stdout contains a ``rouge_l:`` entry."""
        text = "hello world today"
        outcome = self._validate(text, "-r", text, "-m", "rouge", "-o", "simple")
        assert outcome.exit_code == 0
        assert "rouge_l:" in outcome.stdout

    def test_validate_inline_missing_reference(self) -> None:
        """Omitting the reference exits with code 1 and prints an error."""
        outcome = self._validate("hello world", "-m", "bleu")
        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_validate_inline_invalid_metric(self) -> None:
        """An unrecognised metric name exits with code 1 and is reported."""
        outcome = self._validate("hello", "-r", "world", "-m", "invalid_metric")
        assert outcome.exit_code == 1
        assert "Unknown metrics" in outcome.stdout
|
||||
|
||||
|
||||
class TestValidateFile:
    """Checks for ``validate`` when input is read from JSONL files."""

    def test_validate_file_basic(self, tmp_path: Path) -> None:
        """A two-record file exits 0, reports ``bleu4`` and the pair count."""
        records = (
            '{"candidate": "hello world today", "reference": "hello world today"}',
            '{"candidate": "foo bar baz", "reference": "foo bar baz"}',
        )
        source = tmp_path / "data.jsonl"
        source.write_text("\n".join(records))

        outcome = runner.invoke(app, ["validate", "-f", str(source), "-m", "bleu"])

        assert outcome.exit_code == 0
        assert "bleu4" in outcome.stdout
        assert "Evaluated 2 text pairs" in outcome.stdout

    def test_validate_file_not_found(self) -> None:
        """A path that does not exist exits with code 1 and prints an error."""
        cli_args = ["validate", "-f", "/nonexistent/file.jsonl", "-m", "bleu"]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_validate_paired_files(self, tmp_path: Path) -> None:
        """Separate candidate and reference files are evaluated pairwise."""
        shared_content = '{"text": "hello world today"}\n{"text": "foo bar baz"}'
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text(shared_content)
        ref_path.write_text(shared_content)

        cli_args = ["validate", "-f", str(cand_path), "-R", str(ref_path), "-m", "bleu"]
        outcome = runner.invoke(app, cli_args)

        assert outcome.exit_code == 0
        assert "Evaluated 2 text pairs" in outcome.stdout
|
||||
|
||||
|
||||
class TestValidateOptions:
    """Checks for the optional flags of the ``validate`` command."""

    @staticmethod
    def _validate(*cli_args: str):
        """Invoke the ``validate`` command with ``cli_args`` and return the result."""
        return runner.invoke(app, ["validate", *cli_args])

    def test_validate_with_threshold(self) -> None:
        """With ``-t`` the command exits 0 and shows pass/fail information."""
        text = "hello world today"
        outcome = self._validate(text, "-r", text, "-m", "bleu", "-t", "0.5")
        assert outcome.exit_code == 0
        # The table is expected to gain a Status column when a threshold is set.
        shown = outcome.stdout
        assert "Status" in shown or "PASS" in shown

    def test_validate_invalid_output_format(self) -> None:
        """An unsupported ``-o`` value exits with code 1 and is reported."""
        outcome = self._validate("hello", "-r", "world", "-m", "bleu", "-o", "invalid")
        assert outcome.exit_code == 1
        assert "Invalid output format" in outcome.stdout

    def test_validate_multiple_metrics(self) -> None:
        """A comma-separated metric list reports a score from each metric."""
        text = "The quick brown fox"
        outcome = self._validate(text, "-r", text, "-m", "bleu,rouge,lexical")
        assert outcome.exit_code == 0
        for metric_name in ("bleu4", "rouge_l", "jaccard"):
            assert metric_name in outcome.stdout
|
||||
Reference in New Issue
Block a user