test(cli): add CLI tests

Add comprehensive test suite for validate command, benchmark commands,
input readers, and output formatters using Typer CliRunner.
This commit is contained in:
2026-02-03 18:22:31 +00:00
parent 0cadfd4d23
commit c54f8c3f6f
5 changed files with 857 additions and 0 deletions

View File

@@ -0,0 +1 @@
"""CLI test suite."""

View File

@@ -0,0 +1,337 @@
"""Tests for CLI benchmark commands."""
from pathlib import Path
from typer.testing import CliRunner
from veritext.cli.main import app
# Module-level Typer test runner; the test classes in this module invoke `app` through it.
runner = CliRunner()
class TestBenchmarkRun:
    """Exercise the ``benchmark run`` subcommand through the CLI runner."""

    def test_benchmark_run_basic(self, tmp_path: Path) -> None:
        """A two-sample run with explicit metrics succeeds and prints its summary."""
        dataset = tmp_path / "data.jsonl"
        dataset.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}\n'
            '{"candidate": "foo bar baz qux", "reference": "foo bar baz qux"}'
        )
        store = tmp_path / "benchmarks"
        argv = ["benchmark", "run", "test_bench"]
        argv += ["-f", str(dataset)]
        argv += ["-m", "rouge_l,bleu4"]
        argv += ["-s", str(store)]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        # Each fragment of the summary must appear in the captured output.
        for fragment in (
            "Benchmark 'test_bench' completed",
            "Samples: 2",
            "rouge_l:",
            "bleu4:",
        ):
            assert fragment in outcome.stdout

    def test_benchmark_run_file_not_found(self, tmp_path: Path) -> None:
        """Pointing ``-f`` at a missing file exits with code 1 and prints an error."""
        store = tmp_path / "benchmarks"
        argv = ["benchmark", "run", "test_bench"]
        argv += ["-f", "/nonexistent/file.jsonl"]
        argv += ["-s", str(store)]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_benchmark_run_creates_storage(self, tmp_path: Path) -> None:
        """A successful run leaves the requested storage path on disk."""
        dataset = tmp_path / "data.jsonl"
        dataset.write_text('{"candidate": "hello", "reference": "hello"}')
        store = tmp_path / "new_benchmarks"
        argv = ["benchmark", "run", "test_bench"]
        argv += ["-f", str(dataset)]
        argv += ["-s", str(store)]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert store.exists()
class TestBenchmarkShow:
    """Tests for benchmark show command."""

    @staticmethod
    def _record_runs(data_file: Path, storage_path: Path, count: int) -> None:
        """Invoke ``benchmark run`` for ``test_bench`` ``count`` times.

        Each setup invocation is asserted to succeed so that a broken setup
        fails here, with the CLI output attached, instead of surfacing later
        as a confusing failure in the ``show`` assertions.
        """
        for _ in range(count):
            setup = runner.invoke(
                app,
                [
                    "benchmark",
                    "run",
                    "test_bench",
                    "-f",
                    str(data_file),
                    "-s",
                    str(storage_path),
                ],
            )
            assert setup.exit_code == 0, setup.stdout

    def test_benchmark_show_no_runs(self, tmp_path: Path) -> None:
        """Test showing benchmark with no runs."""
        storage_path = tmp_path / "benchmarks"
        storage_path.mkdir()
        result = runner.invoke(
            app,
            [
                "benchmark",
                "show",
                "nonexistent_bench",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "No benchmark runs found" in result.stdout

    def test_benchmark_show_with_runs(self, tmp_path: Path) -> None:
        """Test showing benchmark history with runs."""
        # First create some runs
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello world", "reference": "hello world"}')
        storage_path = tmp_path / "benchmarks"
        # Run benchmark twice
        self._record_runs(data_file, storage_path, count=2)
        # Show history
        result = runner.invoke(
            app,
            [
                "benchmark",
                "show",
                "test_bench",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "Benchmark History" in result.stdout

    def test_benchmark_show_limit(self, tmp_path: Path) -> None:
        """Test showing limited benchmark history."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
        storage_path = tmp_path / "benchmarks"
        # Run benchmark 3 times
        self._record_runs(data_file, storage_path, count=3)
        # Show only last 2
        result = runner.invoke(
            app,
            [
                "benchmark",
                "show",
                "test_bench",
                "--last",
                "2",
                "-s",
                str(storage_path),
            ],
        )
        # NOTE(review): only the exit status is checked; the number of rows
        # actually rendered for ``--last 2`` is not asserted by this test.
        assert result.exit_code == 0
class TestBenchmarkCheck:
    """Tests for benchmark check command."""

    @staticmethod
    def _record_run(data_file: Path, storage_path: Path) -> None:
        """Invoke ``benchmark run`` once for ``test_bench`` and require success.

        Asserting on the setup invocation makes a broken setup fail here, with
        the CLI output attached, rather than as a misleading result from the
        subsequent ``check`` assertions.
        """
        setup = runner.invoke(
            app,
            [
                "benchmark",
                "run",
                "test_bench",
                "-f",
                str(data_file),
                "-s",
                str(storage_path),
            ],
        )
        assert setup.exit_code == 0, setup.stdout

    def test_benchmark_check_no_regression(self, tmp_path: Path) -> None:
        """Test checking for regression with no regression."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}'
        )
        storage_path = tmp_path / "benchmarks"
        # Run benchmark twice with same data (no regression)
        for _ in range(2):
            self._record_run(data_file, storage_path)
        # Check for regression
        result = runner.invoke(
            app,
            [
                "benchmark",
                "check",
                "test_bench",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "No regression detected" in result.stdout

    def test_benchmark_check_with_regression(self, tmp_path: Path) -> None:
        """Test checking for regression when regression occurs."""
        storage_path = tmp_path / "benchmarks"
        # First run with good data
        good_file = tmp_path / "good.jsonl"
        good_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}'
        )
        self._record_run(good_file, storage_path)
        # Second run with bad data (regression)
        bad_file = tmp_path / "bad.jsonl"
        bad_file.write_text(
            '{"candidate": "completely different", "reference": "hello world today"}'
        )
        self._record_run(bad_file, storage_path)
        # Check for regression
        result = runner.invoke(
            app,
            [
                "benchmark",
                "check",
                "test_bench",
                "-t",
                "0.05",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 1
        assert "Regression detected" in result.stdout

    def test_benchmark_check_custom_tolerance(self, tmp_path: Path) -> None:
        """Test checking regression with custom tolerance."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
        storage_path = tmp_path / "benchmarks"
        self._record_run(data_file, storage_path)
        result = runner.invoke(
            app,
            [
                "benchmark",
                "check",
                "test_bench",
                "--tolerance",
                "0.10",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        # The tolerance passed as 0.10 is expected to be echoed as a percentage.
        assert "10.00%" in result.stdout
class TestBenchmarkHelp:
    """Check the ``--help`` output of the benchmark group and its subcommands."""

    def test_benchmark_help(self) -> None:
        """The group help exits cleanly and contains ``run``, ``show`` and ``check``."""
        outcome = runner.invoke(app, ["benchmark", "--help"])
        assert outcome.exit_code == 0
        for subcommand in ("run", "show", "check"):
            assert subcommand in outcome.stdout

    def test_benchmark_run_help(self) -> None:
        """``benchmark run --help`` contains ``--file`` and ``--metrics``."""
        outcome = runner.invoke(app, ["benchmark", "run", "--help"])
        assert outcome.exit_code == 0
        for option in ("--file", "--metrics"):
            assert option in outcome.stdout

    def test_benchmark_show_help(self) -> None:
        """``benchmark show --help`` contains ``--last``."""
        outcome = runner.invoke(app, ["benchmark", "show", "--help"])
        assert outcome.exit_code == 0
        help_text = outcome.stdout
        assert "--last" in help_text

    def test_benchmark_check_help(self) -> None:
        """``benchmark check --help`` contains ``--tolerance`` and ``--window``."""
        outcome = runner.invoke(app, ["benchmark", "check", "--help"])
        assert outcome.exit_code == 0
        for option in ("--tolerance", "--window"):
            assert option in outcome.stdout

View File

@@ -0,0 +1,141 @@
"""Tests for CLI output formatters."""
from datetime import UTC, datetime
from veritext.benchmark.models import BenchmarkRun, RegressionReport
from veritext.cli.formatters import (
format_benchmark_history,
format_regression_report,
format_validation_json,
format_validation_simple,
format_validation_table,
)
class TestFormatValidationTable:
    """Tests for format_validation_table function."""

    def test_format_empty_results(self) -> None:
        """Test formatting empty results."""
        table = format_validation_table({})
        assert table.title == "Validation Results"
        assert table.row_count == 0

    def test_format_single_metric(self) -> None:
        """Test formatting a single metric."""
        results = {"bleu4": 0.8523}
        table = format_validation_table(results)
        assert table.row_count == 1

    def test_format_multiple_metrics(self) -> None:
        """Test formatting multiple metrics."""
        results = {"bleu4": 0.85, "rouge_l": 0.92, "jaccard": 0.75}
        table = format_validation_table(results)
        assert table.row_count == 3

    def test_format_with_threshold(self) -> None:
        """Test formatting with threshold for pass/fail."""
        results = {"bleu4": 0.85, "rouge_l": 0.45}
        table = format_validation_table(results, threshold=0.5)
        # Should have 3 columns: Metric, Score, Status.
        # NOTE(review): assumes the returned object is a Rich ``Table`` exposing
        # ``columns``; previously this expectation was stated but never checked.
        assert len(table.columns) == 3
        # One row per metric in the input mapping.
        assert table.row_count == 2
class TestFormatValidationJson:
    """Cover ``format_validation_json`` for empty and populated score mappings."""

    def test_format_empty_results(self) -> None:
        """An empty mapping is rendered as the literal ``{}``."""
        rendered = format_validation_json({})
        assert rendered == "{}"

    def test_format_results(self) -> None:
        """Each metric appears in the output as a ``"name": value`` fragment."""
        scores = {"bleu4": 0.85, "rouge_l": 0.92}
        rendered = format_validation_json(scores)
        for fragment in ('"bleu4": 0.85', '"rouge_l": 0.92'):
            assert fragment in rendered
class TestFormatValidationSimple:
    """Cover ``format_validation_simple`` for empty and populated score mappings."""

    def test_format_empty_results(self) -> None:
        """An empty mapping is rendered as an empty string."""
        rendered = format_validation_simple({})
        assert rendered == ""

    def test_format_results(self) -> None:
        """Each metric appears in the output as a ``name: value`` fragment."""
        scores = {"bleu4": 0.8523, "rouge_l": 0.9234}
        rendered = format_validation_simple(scores)
        for fragment in ("bleu4: 0.8523", "rouge_l: 0.9234"):
            assert fragment in rendered
class TestFormatBenchmarkHistory:
    """Cover ``format_benchmark_history`` for zero, one and several runs."""

    def test_format_empty_history(self) -> None:
        """An empty run list still yields a table titled ``Benchmark History``."""
        rendered = format_benchmark_history([])
        assert rendered.title == "Benchmark History"

    def test_format_single_run(self) -> None:
        """A single run produces exactly one table row."""
        recorded_at = datetime(2024, 1, 15, 10, 30, tzinfo=UTC)
        only_run = BenchmarkRun(
            id="test-id",
            benchmark_name="test",
            timestamp=recorded_at,
            veritext_version="0.1.0",
            metrics={"rouge_l": 0.85, "bleu4": 0.72},
            sample_count=100,
        )
        rendered = format_benchmark_history([only_run])
        assert rendered.row_count == 1

    def test_format_multiple_runs(self) -> None:
        """Three runs produce three table rows."""
        history = []
        for index in range(3):
            # Consecutive days in January 2024, with a slightly rising score.
            history.append(
                BenchmarkRun(
                    id=f"test-id-{index}",
                    benchmark_name="test",
                    timestamp=datetime(2024, 1, index + 1, 10, 30, tzinfo=UTC),
                    veritext_version="0.1.0",
                    metrics={"rouge_l": 0.8 + index * 0.01},
                    sample_count=100,
                )
            )
        rendered = format_benchmark_history(history)
        assert rendered.row_count == 3
class TestFormatRegressionReport:
    """Cover ``format_regression_report`` for passing and failing reports."""

    def test_format_no_regression(self) -> None:
        """A report with ``detected=False`` renders with a green border."""
        clean_report = RegressionReport(
            detected=False,
            baseline={"rouge_l": 0.85},
            current={"rouge_l": 0.86},
            deltas={"rouge_l": 0.01},
            tolerance=0.05,
        )
        rendered = format_regression_report(clean_report)
        assert rendered.title == "Regression Check"
        assert rendered.border_style == "green"

    def test_format_with_regression(self) -> None:
        """A report with ``detected=True`` renders with a red border."""
        failing_report = RegressionReport(
            detected=True,
            baseline={"rouge_l": 0.85, "bleu4": 0.72},
            current={"rouge_l": 0.70, "bleu4": 0.70},
            deltas={"rouge_l": -0.15, "bleu4": -0.02},
            tolerance=0.05,
        )
        rendered = format_regression_report(failing_report)
        assert rendered.title == "Regression Check"
        assert rendered.border_style == "red"

View File

@@ -0,0 +1,145 @@
"""Tests for CLI input readers."""
import json
from pathlib import Path
import pytest
from veritext.cli.readers import TextPair, read_jsonl, read_paired_jsonl
class TestTextPair:
    """Cover construction of ``TextPair``."""

    def test_create_text_pair(self) -> None:
        """Keyword arguments are exposed unchanged as attributes."""
        sample = TextPair(candidate="hello", reference="world")
        assert (sample.candidate, sample.reference) == ("hello", "world")
class TestReadJsonl:
    """Cover ``read_jsonl`` on well-formed, empty and malformed input files."""

    def test_read_valid_jsonl(self, tmp_path: Path) -> None:
        """Two JSON records are read back as two pairs in file order."""
        records = [
            {"candidate": "foo", "reference": "bar"},
            {"candidate": "baz", "reference": "qux"},
        ]
        source = tmp_path / "data.jsonl"
        source.write_text("\n".join(map(json.dumps, records)))

        loaded = read_jsonl(source)

        assert len(loaded) == 2
        first, second = loaded[0], loaded[1]
        assert first.candidate == "foo"
        assert first.reference == "bar"
        assert second.candidate == "baz"
        assert second.reference == "qux"

    def test_read_empty_file(self, tmp_path: Path) -> None:
        """An empty file yields an empty list."""
        source = tmp_path / "empty.jsonl"
        source.write_text("")
        assert read_jsonl(source) == []

    def test_read_file_with_blank_lines(self, tmp_path: Path) -> None:
        """A blank line between two records does not change the pair count."""
        source = tmp_path / "data.jsonl"
        source.write_text(
            '{"candidate": "a", "reference": "b"}\n\n{"candidate": "c", "reference": "d"}\n'
        )
        loaded = read_jsonl(source)
        assert len(loaded) == 2

    def test_read_file_not_found(self, tmp_path: Path) -> None:
        """A path that does not exist raises ``FileNotFoundError``."""
        missing = tmp_path / "nonexistent.jsonl"
        with pytest.raises(FileNotFoundError):
            read_jsonl(missing)

    def test_read_invalid_json(self, tmp_path: Path) -> None:
        """Unparseable content raises ``ValueError`` that names line 1."""
        source = tmp_path / "invalid.jsonl"
        source.write_text("not valid json")
        with pytest.raises(ValueError, match="Invalid JSON on line 1"):
            read_jsonl(source)

    def test_read_missing_candidate_key(self, tmp_path: Path) -> None:
        """A record without ``candidate`` raises ``ValueError`` that names the key."""
        source = tmp_path / "data.jsonl"
        source.write_text('{"reference": "bar"}')
        with pytest.raises(ValueError, match="Missing 'candidate' key on line 1"):
            read_jsonl(source)

    def test_read_missing_reference_key(self, tmp_path: Path) -> None:
        """A record without ``reference`` raises ``ValueError`` that names the key."""
        source = tmp_path / "data.jsonl"
        source.write_text('{"candidate": "foo"}')
        with pytest.raises(ValueError, match="Missing 'reference' key on line 1"):
            read_jsonl(source)
class TestReadPairedJsonl:
    """Cover ``read_paired_jsonl`` on matching, mismatched and missing files."""

    def test_read_paired_valid(self, tmp_path: Path) -> None:
        """Records are paired by position across the two files."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text('{"text": "foo"}\n{"text": "bar"}')
        ref_path.write_text('{"text": "baz"}\n{"text": "qux"}')

        loaded = read_paired_jsonl(cand_path, ref_path)

        assert len(loaded) == 2
        first, second = loaded[0], loaded[1]
        assert first.candidate == "foo"
        assert first.reference == "baz"
        assert second.candidate == "bar"
        assert second.reference == "qux"

    def test_read_paired_length_mismatch(self, tmp_path: Path) -> None:
        """Files with different record counts raise ``ValueError``."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text('{"text": "foo"}\n{"text": "bar"}')
        ref_path.write_text('{"text": "baz"}')
        with pytest.raises(ValueError, match="does not match"):
            read_paired_jsonl(cand_path, ref_path)

    def test_read_paired_candidates_not_found(self, tmp_path: Path) -> None:
        """A missing candidates file raises ``FileNotFoundError`` naming that role."""
        ref_path = tmp_path / "references.jsonl"
        ref_path.write_text('{"text": "baz"}')
        missing = tmp_path / "nonexistent.jsonl"
        with pytest.raises(FileNotFoundError, match="Candidates file not found"):
            read_paired_jsonl(missing, ref_path)

    def test_read_paired_references_not_found(self, tmp_path: Path) -> None:
        """A missing references file raises ``FileNotFoundError`` naming that role."""
        cand_path = tmp_path / "candidates.jsonl"
        cand_path.write_text('{"text": "foo"}')
        missing = tmp_path / "nonexistent.jsonl"
        with pytest.raises(FileNotFoundError, match="References file not found"):
            read_paired_jsonl(cand_path, missing)

    def test_read_paired_missing_text_key(self, tmp_path: Path) -> None:
        """A candidates record without ``text`` raises ``ValueError``."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text('{"value": "foo"}')
        ref_path.write_text('{"text": "baz"}')
        with pytest.raises(ValueError, match="Missing 'text' key in candidates file"):
            read_paired_jsonl(cand_path, ref_path)

View File

@@ -0,0 +1,233 @@
"""Tests for CLI validate command."""
import json
from pathlib import Path
from typer.testing import CliRunner
from veritext.cli.main import app
# Module-level Typer test runner; the test classes in this module invoke `app` through it.
runner = CliRunner()
class TestValidateInline:
    """Exercise ``validate`` with the candidate text given on the command line."""

    def test_validate_inline_basic(self) -> None:
        """Identical texts scored with ``bleu`` succeed and print ``bleu4``."""
        argv = ["validate", "The quick brown fox jumps"]
        argv += ["-r", "The quick brown fox jumps"]
        argv += ["-m", "bleu"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "bleu4" in outcome.stdout

    def test_validate_inline_with_rouge(self) -> None:
        """Selecting ``rouge`` succeeds and prints ``rouge_l``."""
        argv = ["validate", "hello world today"]
        argv += ["-r", "hello world here"]
        argv += ["-m", "rouge"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "rouge_l" in outcome.stdout

    def test_validate_inline_with_lexical(self) -> None:
        """Selecting ``lexical`` succeeds and prints ``jaccard`` and ``token_overlap``."""
        argv = ["validate", "hello world"]
        argv += ["-r", "hello everyone"]
        argv += ["-m", "lexical"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        for metric_name in ("jaccard", "token_overlap"):
            assert metric_name in outcome.stdout

    def test_validate_inline_json_output(self) -> None:
        """With ``-o json`` the output parses as JSON and contains a ``bleu4`` key."""
        argv = ["validate", "hello world today"]
        argv += ["-r", "hello world today"]
        argv += ["-m", "bleu"]
        argv += ["-o", "json"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        parsed = json.loads(outcome.stdout)
        assert "bleu4" in parsed

    def test_validate_inline_simple_output(self) -> None:
        """With ``-o simple`` the output contains a ``rouge_l:`` entry."""
        argv = ["validate", "hello world today"]
        argv += ["-r", "hello world today"]
        argv += ["-m", "rouge"]
        argv += ["-o", "simple"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "rouge_l:" in outcome.stdout

    def test_validate_inline_missing_reference(self) -> None:
        """Omitting the reference exits with code 1 and prints an error."""
        argv = ["validate", "hello world", "-m", "bleu"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_validate_inline_invalid_metric(self) -> None:
        """An unrecognised metric name exits with code 1 and is reported."""
        argv = ["validate", "hello", "-r", "world", "-m", "invalid_metric"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Unknown metrics" in outcome.stdout
class TestValidateFile:
    """Exercise ``validate`` with input supplied through JSONL files."""

    def test_validate_file_basic(self, tmp_path: Path) -> None:
        """A combined two-record file succeeds and reports two evaluated pairs."""
        dataset = tmp_path / "data.jsonl"
        dataset.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}\n'
            '{"candidate": "foo bar baz", "reference": "foo bar baz"}'
        )
        argv = ["validate", "-f", str(dataset), "-m", "bleu"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        for fragment in ("bleu4", "Evaluated 2 text pairs"):
            assert fragment in outcome.stdout

    def test_validate_file_not_found(self) -> None:
        """Pointing ``-f`` at a missing file exits with code 1 and prints an error."""
        argv = ["validate", "-f", "/nonexistent/file.jsonl", "-m", "bleu"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_validate_paired_files(self, tmp_path: Path) -> None:
        """Separate candidate (``-f``) and reference (``-R``) files are evaluated together."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text(
            '{"text": "hello world today"}\n{"text": "foo bar baz"}'
        )
        ref_path.write_text(
            '{"text": "hello world today"}\n{"text": "foo bar baz"}'
        )
        argv = ["validate"]
        argv += ["-f", str(cand_path)]
        argv += ["-R", str(ref_path)]
        argv += ["-m", "bleu"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "Evaluated 2 text pairs" in outcome.stdout
class TestValidateOptions:
    """Exercise the threshold, output-format and multi-metric options of ``validate``."""

    def test_validate_with_threshold(self) -> None:
        """Passing ``-t`` succeeds and the output mentions ``Status`` or ``PASS``."""
        argv = ["validate", "hello world today"]
        argv += ["-r", "hello world today"]
        argv += ["-m", "bleu"]
        argv += ["-t", "0.5"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        # Either marker is accepted as evidence of the pass/fail column.
        assert any(marker in outcome.stdout for marker in ("Status", "PASS"))

    def test_validate_invalid_output_format(self) -> None:
        """An unsupported ``-o`` value exits with code 1 and is reported."""
        argv = ["validate", "hello"]
        argv += ["-r", "world"]
        argv += ["-m", "bleu"]
        argv += ["-o", "invalid"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Invalid output format" in outcome.stdout

    def test_validate_multiple_metrics(self) -> None:
        """A comma-separated metric list prints a score name from each requested metric."""
        argv = ["validate", "The quick brown fox"]
        argv += ["-r", "The quick brown fox"]
        argv += ["-m", "bleu,rouge,lexical"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        for metric_name in ("bleu4", "rouge_l", "jaccard"):
            assert metric_name in outcome.stdout