"""Tests for CLI benchmark commands.""" from pathlib import Path from typer.testing import CliRunner from veritext.cli.main import app runner = CliRunner() class TestBenchmarkRun: """Tests for benchmark run command.""" def test_benchmark_run_basic(self, tmp_path: Path) -> None: """Test basic benchmark run.""" data_file = tmp_path / "data.jsonl" data_file.write_text( '{"candidate": "hello world today", "reference": "hello world today"}\n' '{"candidate": "foo bar baz qux", "reference": "foo bar baz qux"}' ) storage_path = tmp_path / "benchmarks" result = runner.invoke( app, [ "benchmark", "run", "test_bench", "-f", str(data_file), "-m", "rouge_l,bleu4", "-s", str(storage_path), ], ) assert result.exit_code == 0 assert "Benchmark 'test_bench' completed" in result.stdout assert "Samples: 2" in result.stdout assert "rouge_l:" in result.stdout assert "bleu4:" in result.stdout def test_benchmark_run_file_not_found(self, tmp_path: Path) -> None: """Test benchmark run with non-existent file.""" result = runner.invoke( app, [ "benchmark", "run", "test_bench", "-f", "/nonexistent/file.jsonl", "-s", str(tmp_path / "benchmarks"), ], ) assert result.exit_code == 1 assert "Error" in result.stdout def test_benchmark_run_creates_storage(self, tmp_path: Path) -> None: """Test that benchmark run creates storage directory.""" data_file = tmp_path / "data.jsonl" data_file.write_text('{"candidate": "hello", "reference": "hello"}') storage_path = tmp_path / "new_benchmarks" result = runner.invoke( app, [ "benchmark", "run", "test_bench", "-f", str(data_file), "-s", str(storage_path), ], ) assert result.exit_code == 0 assert storage_path.exists() class TestBenchmarkShow: """Tests for benchmark show command.""" def test_benchmark_show_no_runs(self, tmp_path: Path) -> None: """Test showing benchmark with no runs.""" storage_path = tmp_path / "benchmarks" storage_path.mkdir() result = runner.invoke( app, [ "benchmark", "show", "nonexistent_bench", "-s", str(storage_path), ], ) assert result.exit_code == 0 assert "No benchmark runs found" in result.stdout def test_benchmark_show_with_runs(self, tmp_path: Path) -> None: """Test showing benchmark history with runs.""" # First create some runs data_file = tmp_path / "data.jsonl" data_file.write_text('{"candidate": "hello world", "reference": "hello world"}') storage_path = tmp_path / "benchmarks" # Run benchmark twice for _ in range(2): runner.invoke( app, [ "benchmark", "run", "test_bench", "-f", str(data_file), "-s", str(storage_path), ], ) # Show history result = runner.invoke( app, [ "benchmark", "show", "test_bench", "-s", str(storage_path), ], ) assert result.exit_code == 0 assert "Benchmark History" in result.stdout def test_benchmark_show_limit(self, tmp_path: Path) -> None: """Test showing limited benchmark history.""" data_file = tmp_path / "data.jsonl" data_file.write_text('{"candidate": "hello", "reference": "hello"}') storage_path = tmp_path / "benchmarks" # Run benchmark 3 times for _ in range(3): runner.invoke( app, [ "benchmark", "run", "test_bench", "-f", str(data_file), "-s", str(storage_path), ], ) # Show only last 2 result = runner.invoke( app, [ "benchmark", "show", "test_bench", "--last", "2", "-s", str(storage_path), ], ) assert result.exit_code == 0 class TestBenchmarkCheck: """Tests for benchmark check command.""" def test_benchmark_check_no_regression(self, tmp_path: Path) -> None: """Test checking for regression with no regression.""" data_file = tmp_path / "data.jsonl" data_file.write_text( '{"candidate": "hello world today", "reference": "hello world today"}' ) storage_path = tmp_path / "benchmarks" # Run benchmark twice with same data (no regression) for _ in range(2): runner.invoke( app, [ "benchmark", "run", "test_bench", "-f", str(data_file), "-s", str(storage_path), ], ) # Check for regression result = runner.invoke( app, [ "benchmark", "check", "test_bench", "-s", str(storage_path), ], ) assert result.exit_code == 0 assert "No regression detected" in result.stdout def test_benchmark_check_with_regression(self, tmp_path: Path) -> None: """Test checking for regression when regression occurs.""" storage_path = tmp_path / "benchmarks" # First run with good data good_file = tmp_path / "good.jsonl" good_file.write_text( '{"candidate": "hello world today", "reference": "hello world today"}' ) runner.invoke( app, [ "benchmark", "run", "test_bench", "-f", str(good_file), "-s", str(storage_path), ], ) # Second run with bad data (regression) bad_file = tmp_path / "bad.jsonl" bad_file.write_text( '{"candidate": "completely different", "reference": "hello world today"}' ) runner.invoke( app, [ "benchmark", "run", "test_bench", "-f", str(bad_file), "-s", str(storage_path), ], ) # Check for regression result = runner.invoke( app, [ "benchmark", "check", "test_bench", "-t", "0.05", "-s", str(storage_path), ], ) assert result.exit_code == 1 assert "Regression detected" in result.stdout def test_benchmark_check_custom_tolerance(self, tmp_path: Path) -> None: """Test checking regression with custom tolerance.""" data_file = tmp_path / "data.jsonl" data_file.write_text('{"candidate": "hello", "reference": "hello"}') storage_path = tmp_path / "benchmarks" runner.invoke( app, [ "benchmark", "run", "test_bench", "-f", str(data_file), "-s", str(storage_path), ], ) result = runner.invoke( app, [ "benchmark", "check", "test_bench", "--tolerance", "0.10", "-s", str(storage_path), ], ) assert result.exit_code == 0 assert "10.00%" in result.stdout class TestBenchmarkHelp: """Tests for benchmark help output.""" def test_benchmark_help(self) -> None: """Test benchmark help output.""" result = runner.invoke(app, ["benchmark", "--help"]) assert result.exit_code == 0 assert "run" in result.stdout assert "show" in result.stdout assert "check" in result.stdout def test_benchmark_run_help(self) -> None: """Test benchmark run help output.""" result = runner.invoke(app, ["benchmark", "run", "--help"]) assert result.exit_code == 0 assert "--file" in result.stdout assert "--metrics" in result.stdout def test_benchmark_show_help(self) -> None: """Test benchmark show help output.""" result = runner.invoke(app, ["benchmark", "show", "--help"]) assert result.exit_code == 0 assert "--last" in result.stdout def test_benchmark_check_help(self) -> None: """Test benchmark check help output.""" result = runner.invoke(app, ["benchmark", "check", "--help"]) assert result.exit_code == 0 assert "--tolerance" in result.stdout assert "--window" in result.stdout