cli tests
Add comprehensive test suite for validate command, benchmark commands, input readers, and output formatters using Typer CliRunner.
This commit is contained in:
316
tests/test_cli/test_benchmark.py
Normal file
316
tests/test_cli/test_benchmark.py
Normal file
@@ -0,0 +1,316 @@
|
||||
"""Tests for CLI benchmark commands."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from typer.testing import CliRunner
|
||||
|
||||
from veritext.cli.main import app
|
||||
|
||||
# Shared Typer test runner; every test in this module invokes the CLI through it.
runner = CliRunner()
|
||||
|
||||
|
||||
class TestBenchmarkRun:
    """Exercise the ``benchmark run`` subcommand through the CLI runner."""

    def test_benchmark_run_basic(self, tmp_path: Path) -> None:
        """A two-sample run with explicit metrics exits 0 and reports each metric."""
        samples = tmp_path / "data.jsonl"
        samples.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}\n'
            '{"candidate": "foo bar baz qux", "reference": "foo bar baz qux"}'
        )
        store = tmp_path / "benchmarks"
        argv = [
            "benchmark",
            "run",
            "test_bench",
            "-f",
            str(samples),
            "-m",
            "rouge_l,bleu4",
            "-s",
            str(store),
        ]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        # Checked in the same order as the summary lines are asserted upstream.
        for fragment in (
            "Benchmark 'test_bench' completed",
            "Samples: 2",
            "rouge_l:",
            "bleu4:",
        ):
            assert fragment in outcome.stdout

    def test_benchmark_run_file_not_found(self, tmp_path: Path) -> None:
        """Pointing ``-f`` at a missing file exits 1 and prints an error."""
        store = tmp_path / "benchmarks"
        argv = [
            "benchmark",
            "run",
            "test_bench",
            "-f",
            "/nonexistent/file.jsonl",
            "-s",
            str(store),
        ]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_benchmark_run_creates_storage(self, tmp_path: Path) -> None:
        """A successful run leaves the (previously absent) storage path on disk."""
        samples = tmp_path / "data.jsonl"
        samples.write_text('{"candidate": "hello", "reference": "hello"}')
        # Deliberately not created beforehand; the command is expected to do so.
        store = tmp_path / "new_benchmarks"
        argv = ["benchmark", "run", "test_bench", "-f", str(samples), "-s", str(store)]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert store.exists()
|
||||
|
||||
|
||||
class TestBenchmarkShow:
    """Exercise the ``benchmark show`` subcommand on empty and populated storage."""

    @staticmethod
    def _record_runs(data_file: Path, storage_path: Path, count: int) -> None:
        """Invoke ``benchmark run`` for ``test_bench`` ``count`` times.

        Each setup invocation is asserted to succeed so that a broken ``run``
        command fails here, at the point of the real problem, instead of
        surfacing later as a confusing ``show`` failure.
        """
        for _ in range(count):
            result = runner.invoke(
                app,
                [
                    "benchmark",
                    "run",
                    "test_bench",
                    "-f",
                    str(data_file),
                    "-s",
                    str(storage_path),
                ],
            )
            assert result.exit_code == 0, result.output

    def test_benchmark_show_no_runs(self, tmp_path: Path) -> None:
        """Showing an unknown benchmark exits 0 with a "no runs" message."""
        storage_path = tmp_path / "benchmarks"
        storage_path.mkdir()

        result = runner.invoke(
            app,
            ["benchmark", "show", "nonexistent_bench", "-s", str(storage_path)],
        )

        assert result.exit_code == 0
        assert "No benchmark runs found" in result.stdout

    def test_benchmark_show_with_runs(self, tmp_path: Path) -> None:
        """After two recorded runs, ``show`` prints the history view."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello world", "reference": "hello world"}')
        storage_path = tmp_path / "benchmarks"

        self._record_runs(data_file, storage_path, count=2)

        result = runner.invoke(
            app,
            ["benchmark", "show", "test_bench", "-s", str(storage_path)],
        )

        assert result.exit_code == 0
        assert "Benchmark History" in result.stdout

    def test_benchmark_show_limit(self, tmp_path: Path) -> None:
        """``--last 2`` is accepted when three runs exist and still shows history."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
        storage_path = tmp_path / "benchmarks"

        self._record_runs(data_file, storage_path, count=3)

        result = runner.invoke(
            app,
            [
                "benchmark",
                "show",
                "test_bench",
                "--last",
                "2",
                "-s",
                str(storage_path),
            ],
        )

        assert result.exit_code == 0
        # Same marker the unlimited history test checks; guards against the
        # command exiting 0 while printing nothing useful.
        assert "Benchmark History" in result.stdout
|
||||
|
||||
|
||||
class TestBenchmarkCheck:
    """Exercise the ``benchmark check`` regression gate."""

    @staticmethod
    def _record_run(data_file: Path, storage_path: Path) -> None:
        """Record one ``benchmark run`` of ``test_bench`` and require it to succeed.

        The check tests depend on stored history; asserting here makes a failed
        setup run visible immediately rather than as a wrong ``check`` result.
        """
        result = runner.invoke(
            app,
            [
                "benchmark",
                "run",
                "test_bench",
                "-f",
                str(data_file),
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0, result.output

    def test_benchmark_check_no_regression(self, tmp_path: Path) -> None:
        """Two runs over identical data pass the check with exit code 0."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}'
        )
        storage_path = tmp_path / "benchmarks"

        # Same data twice, so the latest run cannot score worse than the first.
        for _ in range(2):
            self._record_run(data_file, storage_path)

        result = runner.invoke(
            app,
            ["benchmark", "check", "test_bench", "-s", str(storage_path)],
        )

        assert result.exit_code == 0
        assert "No regression detected" in result.stdout

    def test_benchmark_check_with_regression(self, tmp_path: Path) -> None:
        """A matching run followed by a mismatching one fails the check with exit 1."""
        storage_path = tmp_path / "benchmarks"

        # First run: candidate equals reference.
        good_file = tmp_path / "good.jsonl"
        good_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}'
        )
        self._record_run(good_file, storage_path)

        # Second run: candidate shares no words with the reference.
        bad_file = tmp_path / "bad.jsonl"
        bad_file.write_text(
            '{"candidate": "completely different", "reference": "hello world today"}'
        )
        self._record_run(bad_file, storage_path)

        result = runner.invoke(
            app,
            [
                "benchmark",
                "check",
                "test_bench",
                "-t",
                "0.05",
                "-s",
                str(storage_path),
            ],
        )

        assert result.exit_code == 1
        assert "Regression detected" in result.stdout

    def test_benchmark_check_custom_tolerance(self, tmp_path: Path) -> None:
        """``--tolerance 0.10`` is accepted and echoed back as ``10.00%``."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
        storage_path = tmp_path / "benchmarks"

        self._record_run(data_file, storage_path)

        result = runner.invoke(
            app,
            [
                "benchmark",
                "check",
                "test_bench",
                "--tolerance",
                "0.10",
                "-s",
                str(storage_path),
            ],
        )

        assert result.exit_code == 0
        assert "10.00%" in result.stdout
|
||||
|
||||
|
||||
class TestBenchmarkHelp:
    """Check that ``--help`` output advertises the expected commands and options."""

    def test_benchmark_help(self) -> None:
        """The group help lists the ``run``, ``show`` and ``check`` subcommands."""
        argv = ["benchmark", "--help"]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        for subcommand in ("run", "show", "check"):
            assert subcommand in outcome.stdout

    def test_benchmark_run_help(self) -> None:
        """``benchmark run --help`` mentions ``--file`` and ``--metrics``."""
        argv = ["benchmark", "run", "--help"]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        for option in ("--file", "--metrics"):
            assert option in outcome.stdout

    def test_benchmark_show_help(self) -> None:
        """``benchmark show --help`` mentions ``--last``."""
        argv = ["benchmark", "show", "--help"]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "--last" in outcome.stdout

    def test_benchmark_check_help(self) -> None:
        """``benchmark check --help`` mentions ``--tolerance`` and ``--window``."""
        argv = ["benchmark", "check", "--help"]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        for option in ("--tolerance", "--window"):
            assert option in outcome.stdout
|
||||
Reference in New Issue
Block a user