Add comprehensive test suite for validate command, benchmark commands, input readers, and output formatters using Typer CliRunner.
338 lines
9.7 KiB
Python
338 lines
9.7 KiB
Python
"""Tests for CLI benchmark commands."""
|
|
|
|
from pathlib import Path
|
|
|
|
from typer.testing import CliRunner
|
|
|
|
from veritext.cli.main import app
|
|
|
|
# Shared CLI test runner: the tests below call ``runner.invoke(app, [...])``
# and inspect the returned result's ``exit_code`` and ``stdout``.
runner = CliRunner()
|
|
|
|
|
|
class TestBenchmarkRun:
    """Exercise the ``benchmark run`` sub-command through the CLI runner."""

    def test_benchmark_run_basic(self, tmp_path: Path) -> None:
        """Run two samples with ``-m rouge_l,bleu4`` and check the printed summary."""
        samples = tmp_path / "data.jsonl"
        first_row = (
            '{"candidate": "hello world today", "reference": "hello world today"}\n'
        )
        second_row = '{"candidate": "foo bar baz qux", "reference": "foo bar baz qux"}'
        samples.write_text(first_row + second_row)
        store = tmp_path / "benchmarks"

        argv = ["benchmark", "run", "test_bench"]
        argv += ["-f", str(samples)]
        argv += ["-m", "rouge_l,bleu4"]
        argv += ["-s", str(store)]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        printed = outcome.stdout
        # Each fragment below must appear somewhere in the command output.
        for fragment in (
            "Benchmark 'test_bench' completed",
            "Samples: 2",
            "rouge_l:",
            "bleu4:",
        ):
            assert fragment in printed

    def test_benchmark_run_file_not_found(self, tmp_path: Path) -> None:
        """Pass a missing input file via ``-f`` and expect exit code 1 plus "Error"."""
        store = tmp_path / "benchmarks"
        argv = ["benchmark", "run", "test_bench"]
        argv += ["-f", "/nonexistent/file.jsonl"]
        argv += ["-s", str(store)]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_benchmark_run_creates_storage(self, tmp_path: Path) -> None:
        """Give ``-s`` a path that was never created and expect it to exist after."""
        samples = tmp_path / "data.jsonl"
        samples.write_text('{"candidate": "hello", "reference": "hello"}')
        # Deliberately not mkdir()'d: the test asserts the path exists afterwards.
        store = tmp_path / "new_benchmarks"

        argv = ["benchmark", "run", "test_bench"]
        argv += ["-f", str(samples)]
        argv += ["-s", str(store)]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert store.exists()
|
|
|
|
|
|
class TestBenchmarkShow:
    """Tests for the ``benchmark show`` command."""

    @staticmethod
    def _record_run(data_file: Path, storage_path: Path) -> None:
        """Invoke ``benchmark run test_bench`` once as test setup.

        Args:
            data_file: Input file passed to the command via ``-f``.
            storage_path: Storage location passed to the command via ``-s``.

        Raises:
            AssertionError: If the setup invocation exits with a non-zero code.
        """
        setup = runner.invoke(
            app,
            [
                "benchmark",
                "run",
                "test_bench",
                "-f",
                str(data_file),
                "-s",
                str(storage_path),
            ],
        )
        # Fail here, with the command output, rather than letting a broken
        # setup run surface later as a confusing ``show`` failure.
        assert setup.exit_code == 0, setup.stdout

    def test_benchmark_show_no_runs(self, tmp_path: Path) -> None:
        """Show a benchmark name that has no runs in an empty storage directory."""
        storage_path = tmp_path / "benchmarks"
        storage_path.mkdir()

        result = runner.invoke(
            app,
            ["benchmark", "show", "nonexistent_bench", "-s", str(storage_path)],
        )

        assert result.exit_code == 0
        assert "No benchmark runs found" in result.stdout

    def test_benchmark_show_with_runs(self, tmp_path: Path) -> None:
        """Record two runs, then expect ``show`` to print "Benchmark History"."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello world", "reference": "hello world"}')
        storage_path = tmp_path / "benchmarks"

        # Run the benchmark twice so there is history to display.
        for _ in range(2):
            self._record_run(data_file, storage_path)

        result = runner.invoke(
            app,
            ["benchmark", "show", "test_bench", "-s", str(storage_path)],
        )

        assert result.exit_code == 0
        assert "Benchmark History" in result.stdout

    def test_benchmark_show_limit(self, tmp_path: Path) -> None:
        """Record three runs, then call ``show --last 2`` and expect success."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
        storage_path = tmp_path / "benchmarks"

        # Run the benchmark three times so ``--last 2`` has something to trim.
        for _ in range(3):
            self._record_run(data_file, storage_path)

        result = runner.invoke(
            app,
            [
                "benchmark",
                "show",
                "test_bench",
                "--last",
                "2",
                "-s",
                str(storage_path),
            ],
        )

        # NOTE(review): only the exit code is checked; the history output format
        # is not visible from this file, so the number of rows shown is not
        # asserted. Consider tightening once the format is confirmed.
        assert result.exit_code == 0
|
|
|
|
|
|
class TestBenchmarkCheck:
    """Tests for the ``benchmark check`` command."""

    @staticmethod
    def _record_run(data_file: Path, storage_path: Path) -> None:
        """Invoke ``benchmark run test_bench`` once as test setup.

        Args:
            data_file: Input file passed to the command via ``-f``.
            storage_path: Storage location passed to the command via ``-s``.

        Raises:
            AssertionError: If the setup invocation exits with a non-zero code.
        """
        setup = runner.invoke(
            app,
            [
                "benchmark",
                "run",
                "test_bench",
                "-f",
                str(data_file),
                "-s",
                str(storage_path),
            ],
        )
        # Without this check a failed setup run would go unnoticed and the
        # ``check`` assertions would be evaluated against the wrong history.
        assert setup.exit_code == 0, setup.stdout

    def test_benchmark_check_no_regression(self, tmp_path: Path) -> None:
        """Record two runs on identical data and expect "No regression detected"."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}'
        )
        storage_path = tmp_path / "benchmarks"

        # Same input both times, so the test expects no regression to be reported.
        for _ in range(2):
            self._record_run(data_file, storage_path)

        result = runner.invoke(
            app,
            ["benchmark", "check", "test_bench", "-s", str(storage_path)],
        )

        assert result.exit_code == 0
        assert "No regression detected" in result.stdout

    def test_benchmark_check_with_regression(self, tmp_path: Path) -> None:
        """Record a matching run then a mismatching one and expect exit code 1."""
        storage_path = tmp_path / "benchmarks"

        # First run: candidate equals reference.
        good_file = tmp_path / "good.jsonl"
        good_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}'
        )
        self._record_run(good_file, storage_path)

        # Second run: candidate shares no words with the reference.
        bad_file = tmp_path / "bad.jsonl"
        bad_file.write_text(
            '{"candidate": "completely different", "reference": "hello world today"}'
        )
        self._record_run(bad_file, storage_path)

        result = runner.invoke(
            app,
            [
                "benchmark",
                "check",
                "test_bench",
                # presumably the short form of --tolerance; confirm against the CLI
                "-t",
                "0.05",
                "-s",
                str(storage_path),
            ],
        )

        assert result.exit_code == 1
        assert "Regression detected" in result.stdout

    def test_benchmark_check_custom_tolerance(self, tmp_path: Path) -> None:
        """Pass ``--tolerance 0.10`` and expect "10.00%" in the output."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
        storage_path = tmp_path / "benchmarks"

        self._record_run(data_file, storage_path)

        result = runner.invoke(
            app,
            [
                "benchmark",
                "check",
                "test_bench",
                "--tolerance",
                "0.10",
                "-s",
                str(storage_path),
            ],
        )

        assert result.exit_code == 0
        assert "10.00%" in result.stdout
|
|
|
|
|
|
class TestBenchmarkHelp:
    """Check the ``--help`` text of the benchmark group and its sub-commands."""

    def test_benchmark_help(self) -> None:
        """Group help exits 0 and mentions ``run``, ``show`` and ``check``."""
        outcome = runner.invoke(app, ["benchmark", "--help"])
        assert outcome.exit_code == 0
        help_text = outcome.stdout
        for command_name in ("run", "show", "check"):
            assert command_name in help_text

    def test_benchmark_run_help(self) -> None:
        """``benchmark run --help`` exits 0 and lists ``--file`` and ``--metrics``."""
        outcome = runner.invoke(app, ["benchmark", "run", "--help"])
        assert outcome.exit_code == 0
        help_text = outcome.stdout
        for option in ("--file", "--metrics"):
            assert option in help_text

    def test_benchmark_show_help(self) -> None:
        """``benchmark show --help`` exits 0 and lists ``--last``."""
        outcome = runner.invoke(app, ["benchmark", "show", "--help"])
        assert outcome.exit_code == 0
        help_text = outcome.stdout
        assert "--last" in help_text

    def test_benchmark_check_help(self) -> None:
        """``benchmark check --help`` exits 0 and lists ``--tolerance``, ``--window``."""
        outcome = runner.invoke(app, ["benchmark", "check", "--help"])
        assert outcome.exit_code == 0
        help_text = outcome.stdout
        for option in ("--tolerance", "--window"):
            assert option in help_text
|