Add comprehensive test suite for validate command, benchmark commands, input readers, and output formatters using Typer CliRunner.
317 lines
8.8 KiB
Python
317 lines
8.8 KiB
Python
"""Tests for CLI benchmark commands."""
|
|
|
|
from pathlib import Path
|
|
|
|
from typer.testing import CliRunner
|
|
|
|
from veritext.cli.main import app
|
|
|
|
runner = CliRunner()
|
|
|
|
|
|
class TestBenchmarkRun:
|
|
def test_benchmark_run_basic(self, tmp_path: Path) -> None:
|
|
data_file = tmp_path / "data.jsonl"
|
|
data_file.write_text(
|
|
'{"candidate": "hello world today", "reference": "hello world today"}\n'
|
|
'{"candidate": "foo bar baz qux", "reference": "foo bar baz qux"}'
|
|
)
|
|
storage_path = tmp_path / "benchmarks"
|
|
|
|
result = runner.invoke(
|
|
app,
|
|
[
|
|
"benchmark",
|
|
"run",
|
|
"test_bench",
|
|
"-f",
|
|
str(data_file),
|
|
"-m",
|
|
"rouge_l,bleu4",
|
|
"-s",
|
|
str(storage_path),
|
|
],
|
|
)
|
|
assert result.exit_code == 0
|
|
assert "Benchmark 'test_bench' completed" in result.stdout
|
|
assert "Samples: 2" in result.stdout
|
|
assert "rouge_l:" in result.stdout
|
|
assert "bleu4:" in result.stdout
|
|
|
|
def test_benchmark_run_file_not_found(self, tmp_path: Path) -> None:
|
|
result = runner.invoke(
|
|
app,
|
|
[
|
|
"benchmark",
|
|
"run",
|
|
"test_bench",
|
|
"-f",
|
|
"/nonexistent/file.jsonl",
|
|
"-s",
|
|
str(tmp_path / "benchmarks"),
|
|
],
|
|
)
|
|
assert result.exit_code == 1
|
|
assert "Error" in result.stdout
|
|
|
|
def test_benchmark_run_creates_storage(self, tmp_path: Path) -> None:
|
|
data_file = tmp_path / "data.jsonl"
|
|
data_file.write_text('{"candidate": "hello", "reference": "hello"}')
|
|
storage_path = tmp_path / "new_benchmarks"
|
|
|
|
result = runner.invoke(
|
|
app,
|
|
[
|
|
"benchmark",
|
|
"run",
|
|
"test_bench",
|
|
"-f",
|
|
str(data_file),
|
|
"-s",
|
|
str(storage_path),
|
|
],
|
|
)
|
|
assert result.exit_code == 0
|
|
assert storage_path.exists()
|
|
|
|
|
|
class TestBenchmarkShow:
|
|
def test_benchmark_show_no_runs(self, tmp_path: Path) -> None:
|
|
storage_path = tmp_path / "benchmarks"
|
|
storage_path.mkdir()
|
|
|
|
result = runner.invoke(
|
|
app,
|
|
[
|
|
"benchmark",
|
|
"show",
|
|
"nonexistent_bench",
|
|
"-s",
|
|
str(storage_path),
|
|
],
|
|
)
|
|
assert result.exit_code == 0
|
|
assert "No benchmark runs found" in result.stdout
|
|
|
|
def test_benchmark_show_with_runs(self, tmp_path: Path) -> None:
|
|
# First create some runs
|
|
data_file = tmp_path / "data.jsonl"
|
|
data_file.write_text('{"candidate": "hello world", "reference": "hello world"}')
|
|
storage_path = tmp_path / "benchmarks"
|
|
|
|
# Run benchmark twice
|
|
for _ in range(2):
|
|
runner.invoke(
|
|
app,
|
|
[
|
|
"benchmark",
|
|
"run",
|
|
"test_bench",
|
|
"-f",
|
|
str(data_file),
|
|
"-s",
|
|
str(storage_path),
|
|
],
|
|
)
|
|
|
|
# Show history
|
|
result = runner.invoke(
|
|
app,
|
|
[
|
|
"benchmark",
|
|
"show",
|
|
"test_bench",
|
|
"-s",
|
|
str(storage_path),
|
|
],
|
|
)
|
|
assert result.exit_code == 0
|
|
assert "Benchmark History" in result.stdout
|
|
|
|
def test_benchmark_show_limit(self, tmp_path: Path) -> None:
|
|
data_file = tmp_path / "data.jsonl"
|
|
data_file.write_text('{"candidate": "hello", "reference": "hello"}')
|
|
storage_path = tmp_path / "benchmarks"
|
|
|
|
# Run benchmark 3 times
|
|
for _ in range(3):
|
|
runner.invoke(
|
|
app,
|
|
[
|
|
"benchmark",
|
|
"run",
|
|
"test_bench",
|
|
"-f",
|
|
str(data_file),
|
|
"-s",
|
|
str(storage_path),
|
|
],
|
|
)
|
|
|
|
# Show only last 2
|
|
result = runner.invoke(
|
|
app,
|
|
[
|
|
"benchmark",
|
|
"show",
|
|
"test_bench",
|
|
"--last",
|
|
"2",
|
|
"-s",
|
|
str(storage_path),
|
|
],
|
|
)
|
|
assert result.exit_code == 0
|
|
|
|
|
|
class TestBenchmarkCheck:
|
|
def test_benchmark_check_no_regression(self, tmp_path: Path) -> None:
|
|
data_file = tmp_path / "data.jsonl"
|
|
data_file.write_text(
|
|
'{"candidate": "hello world today", "reference": "hello world today"}'
|
|
)
|
|
storage_path = tmp_path / "benchmarks"
|
|
|
|
# Run benchmark twice with same data (no regression)
|
|
for _ in range(2):
|
|
runner.invoke(
|
|
app,
|
|
[
|
|
"benchmark",
|
|
"run",
|
|
"test_bench",
|
|
"-f",
|
|
str(data_file),
|
|
"-s",
|
|
str(storage_path),
|
|
],
|
|
)
|
|
|
|
# Check for regression
|
|
result = runner.invoke(
|
|
app,
|
|
[
|
|
"benchmark",
|
|
"check",
|
|
"test_bench",
|
|
"-s",
|
|
str(storage_path),
|
|
],
|
|
)
|
|
assert result.exit_code == 0
|
|
assert "No regression detected" in result.stdout
|
|
|
|
def test_benchmark_check_with_regression(self, tmp_path: Path) -> None:
|
|
storage_path = tmp_path / "benchmarks"
|
|
|
|
# First run with good data
|
|
good_file = tmp_path / "good.jsonl"
|
|
good_file.write_text(
|
|
'{"candidate": "hello world today", "reference": "hello world today"}'
|
|
)
|
|
runner.invoke(
|
|
app,
|
|
[
|
|
"benchmark",
|
|
"run",
|
|
"test_bench",
|
|
"-f",
|
|
str(good_file),
|
|
"-s",
|
|
str(storage_path),
|
|
],
|
|
)
|
|
|
|
# Second run with bad data (regression)
|
|
bad_file = tmp_path / "bad.jsonl"
|
|
bad_file.write_text(
|
|
'{"candidate": "completely different", "reference": "hello world today"}'
|
|
)
|
|
runner.invoke(
|
|
app,
|
|
[
|
|
"benchmark",
|
|
"run",
|
|
"test_bench",
|
|
"-f",
|
|
str(bad_file),
|
|
"-s",
|
|
str(storage_path),
|
|
],
|
|
)
|
|
|
|
# Check for regression
|
|
result = runner.invoke(
|
|
app,
|
|
[
|
|
"benchmark",
|
|
"check",
|
|
"test_bench",
|
|
"-t",
|
|
"0.05",
|
|
"-s",
|
|
str(storage_path),
|
|
],
|
|
)
|
|
assert result.exit_code == 1
|
|
assert "Regression detected" in result.stdout
|
|
|
|
def test_benchmark_check_custom_tolerance(self, tmp_path: Path) -> None:
|
|
data_file = tmp_path / "data.jsonl"
|
|
data_file.write_text('{"candidate": "hello", "reference": "hello"}')
|
|
storage_path = tmp_path / "benchmarks"
|
|
|
|
runner.invoke(
|
|
app,
|
|
[
|
|
"benchmark",
|
|
"run",
|
|
"test_bench",
|
|
"-f",
|
|
str(data_file),
|
|
"-s",
|
|
str(storage_path),
|
|
],
|
|
)
|
|
|
|
result = runner.invoke(
|
|
app,
|
|
[
|
|
"benchmark",
|
|
"check",
|
|
"test_bench",
|
|
"--tolerance",
|
|
"0.10",
|
|
"-s",
|
|
str(storage_path),
|
|
],
|
|
)
|
|
assert result.exit_code == 0
|
|
assert "10.00%" in result.stdout
|
|
|
|
|
|
class TestBenchmarkHelp:
|
|
def test_benchmark_help(self) -> None:
|
|
result = runner.invoke(app, ["benchmark", "--help"])
|
|
assert result.exit_code == 0
|
|
assert "run" in result.stdout
|
|
assert "show" in result.stdout
|
|
assert "check" in result.stdout
|
|
|
|
def test_benchmark_run_help(self) -> None:
|
|
result = runner.invoke(app, ["benchmark", "run", "--help"])
|
|
assert result.exit_code == 0
|
|
assert "--file" in result.stdout
|
|
assert "--metrics" in result.stdout
|
|
|
|
def test_benchmark_show_help(self) -> None:
|
|
result = runner.invoke(app, ["benchmark", "show", "--help"])
|
|
assert result.exit_code == 0
|
|
assert "--last" in result.stdout
|
|
|
|
def test_benchmark_check_help(self) -> None:
|
|
result = runner.invoke(app, ["benchmark", "check", "--help"])
|
|
assert result.exit_code == 0
|
|
assert "--tolerance" in result.stdout
|
|
assert "--window" in result.stdout
|