test(cli): add CLI tests

Add comprehensive test suite for validate command, benchmark commands,
input readers, and output formatters using Typer CliRunner.
This commit is contained in:
2026-02-03 18:22:31 +00:00
parent 0cadfd4d23
commit c54f8c3f6f
5 changed files with 857 additions and 0 deletions

View File

@@ -0,0 +1,337 @@
"""Tests for CLI benchmark commands."""
from pathlib import Path
from typer.testing import CliRunner
from veritext.cli.main import app
runner = CliRunner()
class TestBenchmarkRun:
"""Tests for benchmark run command."""
def test_benchmark_run_basic(self, tmp_path: Path) -> None:
"""Test basic benchmark run."""
data_file = tmp_path / "data.jsonl"
data_file.write_text(
'{"candidate": "hello world today", "reference": "hello world today"}\n'
'{"candidate": "foo bar baz qux", "reference": "foo bar baz qux"}'
)
storage_path = tmp_path / "benchmarks"
result = runner.invoke(
app,
[
"benchmark",
"run",
"test_bench",
"-f",
str(data_file),
"-m",
"rouge_l,bleu4",
"-s",
str(storage_path),
],
)
assert result.exit_code == 0
assert "Benchmark 'test_bench' completed" in result.stdout
assert "Samples: 2" in result.stdout
assert "rouge_l:" in result.stdout
assert "bleu4:" in result.stdout
def test_benchmark_run_file_not_found(self, tmp_path: Path) -> None:
"""Test benchmark run with non-existent file."""
result = runner.invoke(
app,
[
"benchmark",
"run",
"test_bench",
"-f",
"/nonexistent/file.jsonl",
"-s",
str(tmp_path / "benchmarks"),
],
)
assert result.exit_code == 1
assert "Error" in result.stdout
def test_benchmark_run_creates_storage(self, tmp_path: Path) -> None:
"""Test that benchmark run creates storage directory."""
data_file = tmp_path / "data.jsonl"
data_file.write_text('{"candidate": "hello", "reference": "hello"}')
storage_path = tmp_path / "new_benchmarks"
result = runner.invoke(
app,
[
"benchmark",
"run",
"test_bench",
"-f",
str(data_file),
"-s",
str(storage_path),
],
)
assert result.exit_code == 0
assert storage_path.exists()
class TestBenchmarkShow:
"""Tests for benchmark show command."""
def test_benchmark_show_no_runs(self, tmp_path: Path) -> None:
"""Test showing benchmark with no runs."""
storage_path = tmp_path / "benchmarks"
storage_path.mkdir()
result = runner.invoke(
app,
[
"benchmark",
"show",
"nonexistent_bench",
"-s",
str(storage_path),
],
)
assert result.exit_code == 0
assert "No benchmark runs found" in result.stdout
def test_benchmark_show_with_runs(self, tmp_path: Path) -> None:
"""Test showing benchmark history with runs."""
# First create some runs
data_file = tmp_path / "data.jsonl"
data_file.write_text('{"candidate": "hello world", "reference": "hello world"}')
storage_path = tmp_path / "benchmarks"
# Run benchmark twice
for _ in range(2):
runner.invoke(
app,
[
"benchmark",
"run",
"test_bench",
"-f",
str(data_file),
"-s",
str(storage_path),
],
)
# Show history
result = runner.invoke(
app,
[
"benchmark",
"show",
"test_bench",
"-s",
str(storage_path),
],
)
assert result.exit_code == 0
assert "Benchmark History" in result.stdout
def test_benchmark_show_limit(self, tmp_path: Path) -> None:
"""Test showing limited benchmark history."""
data_file = tmp_path / "data.jsonl"
data_file.write_text('{"candidate": "hello", "reference": "hello"}')
storage_path = tmp_path / "benchmarks"
# Run benchmark 3 times
for _ in range(3):
runner.invoke(
app,
[
"benchmark",
"run",
"test_bench",
"-f",
str(data_file),
"-s",
str(storage_path),
],
)
# Show only last 2
result = runner.invoke(
app,
[
"benchmark",
"show",
"test_bench",
"--last",
"2",
"-s",
str(storage_path),
],
)
assert result.exit_code == 0
class TestBenchmarkCheck:
"""Tests for benchmark check command."""
def test_benchmark_check_no_regression(self, tmp_path: Path) -> None:
"""Test checking for regression with no regression."""
data_file = tmp_path / "data.jsonl"
data_file.write_text(
'{"candidate": "hello world today", "reference": "hello world today"}'
)
storage_path = tmp_path / "benchmarks"
# Run benchmark twice with same data (no regression)
for _ in range(2):
runner.invoke(
app,
[
"benchmark",
"run",
"test_bench",
"-f",
str(data_file),
"-s",
str(storage_path),
],
)
# Check for regression
result = runner.invoke(
app,
[
"benchmark",
"check",
"test_bench",
"-s",
str(storage_path),
],
)
assert result.exit_code == 0
assert "No regression detected" in result.stdout
def test_benchmark_check_with_regression(self, tmp_path: Path) -> None:
"""Test checking for regression when regression occurs."""
storage_path = tmp_path / "benchmarks"
# First run with good data
good_file = tmp_path / "good.jsonl"
good_file.write_text(
'{"candidate": "hello world today", "reference": "hello world today"}'
)
runner.invoke(
app,
[
"benchmark",
"run",
"test_bench",
"-f",
str(good_file),
"-s",
str(storage_path),
],
)
# Second run with bad data (regression)
bad_file = tmp_path / "bad.jsonl"
bad_file.write_text(
'{"candidate": "completely different", "reference": "hello world today"}'
)
runner.invoke(
app,
[
"benchmark",
"run",
"test_bench",
"-f",
str(bad_file),
"-s",
str(storage_path),
],
)
# Check for regression
result = runner.invoke(
app,
[
"benchmark",
"check",
"test_bench",
"-t",
"0.05",
"-s",
str(storage_path),
],
)
assert result.exit_code == 1
assert "Regression detected" in result.stdout
def test_benchmark_check_custom_tolerance(self, tmp_path: Path) -> None:
"""Test checking regression with custom tolerance."""
data_file = tmp_path / "data.jsonl"
data_file.write_text('{"candidate": "hello", "reference": "hello"}')
storage_path = tmp_path / "benchmarks"
runner.invoke(
app,
[
"benchmark",
"run",
"test_bench",
"-f",
str(data_file),
"-s",
str(storage_path),
],
)
result = runner.invoke(
app,
[
"benchmark",
"check",
"test_bench",
"--tolerance",
"0.10",
"-s",
str(storage_path),
],
)
assert result.exit_code == 0
assert "10.00%" in result.stdout
class TestBenchmarkHelp:
"""Tests for benchmark help output."""
def test_benchmark_help(self) -> None:
"""Test benchmark help output."""
result = runner.invoke(app, ["benchmark", "--help"])
assert result.exit_code == 0
assert "run" in result.stdout
assert "show" in result.stdout
assert "check" in result.stdout
def test_benchmark_run_help(self) -> None:
"""Test benchmark run help output."""
result = runner.invoke(app, ["benchmark", "run", "--help"])
assert result.exit_code == 0
assert "--file" in result.stdout
assert "--metrics" in result.stdout
def test_benchmark_show_help(self) -> None:
"""Test benchmark show help output."""
result = runner.invoke(app, ["benchmark", "show", "--help"])
assert result.exit_code == 0
assert "--last" in result.stdout
def test_benchmark_check_help(self) -> None:
"""Test benchmark check help output."""
result = runner.invoke(app, ["benchmark", "check", "--help"])
assert result.exit_code == 0
assert "--tolerance" in result.stdout
assert "--window" in result.stdout