cli tests

Add comprehensive test suite for validate command, benchmark commands, input readers, and output formatters using Typer CliRunner.
2025-05-11 14:13:30 +00:00
parent 5f619a626b
commit 8511594697
5 changed files with 775 additions and 0 deletions
--- a/tests/test_cli/test_benchmark.py
+++ b/tests/test_cli/test_benchmark.py
@@ -0,0 +1,316 @@
+"""Tests for CLI benchmark commands."""
+
+from pathlib import Path
+
+from typer.testing import CliRunner
+
+from veritext.cli.main import app
+
+# Module-wide CliRunner; the tests below all call `runner.invoke(app, [...])` on it.
+runner = CliRunner()
+
+
+class TestBenchmarkRun:
+    """Exercise ``benchmark run`` through the Typer test runner."""
+
+    def test_benchmark_run_basic(self, tmp_path: Path) -> None:
+        """Two matching samples run cleanly and both requested metrics print."""
+        data_file = tmp_path / "data.jsonl"
+        data_file.write_text(
+            '{"candidate": "hello world today", "reference": "hello world today"}\n'
+            '{"candidate": "foo bar baz qux", "reference": "foo bar baz qux"}'
+        )
+        storage_path = tmp_path / "benchmarks"
+
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "run",
+                "test_bench",
+                "-f",
+                str(data_file),
+                "-m",
+                "rouge_l,bleu4",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 0
+        assert "Benchmark 'test_bench' completed" in result.stdout
+        assert "Samples: 2" in result.stdout
+        # Every metric requested via -m must show up in the summary.
+        for metric in ("rouge_l", "bleu4"):
+            assert f"{metric}:" in result.stdout
+
+    def test_benchmark_run_file_not_found(self, tmp_path: Path) -> None:
+        """A missing input file exits with code 1 and prints an error."""
+        # Use a path under tmp_path instead of a hard-coded absolute path so
+        # the file is guaranteed to be absent on every host and platform.
+        missing_file = tmp_path / "missing.jsonl"
+        assert not missing_file.exists()
+
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "run",
+                "test_bench",
+                "-f",
+                str(missing_file),
+                "-s",
+                str(tmp_path / "benchmarks"),
+            ],
+        )
+        assert result.exit_code == 1
+        assert "Error" in result.stdout
+
+    def test_benchmark_run_creates_storage(self, tmp_path: Path) -> None:
+        """Running against a storage path that does not exist yet creates it."""
+        data_file = tmp_path / "data.jsonl"
+        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
+        storage_path = tmp_path / "new_benchmarks"
+        # Precondition: without it the final exists() check could not tell
+        # "created by the command" apart from "was already there".
+        assert not storage_path.exists()
+
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "run",
+                "test_bench",
+                "-f",
+                str(data_file),
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 0
+        assert storage_path.exists()
+
+
+class TestBenchmarkShow:
+    """Exercise ``benchmark show`` through the Typer test runner."""
+
+    @staticmethod
+    def _record_runs(data_file: Path, storage_path: Path, count: int) -> None:
+        """Run ``benchmark run test_bench`` ``count`` times, checking each run.
+
+        Args:
+            data_file: Input file passed via ``-f``.
+            storage_path: Storage location passed via ``-s``.
+            count: Number of runs to record.
+        """
+        for _ in range(count):
+            setup = runner.invoke(
+                app,
+                [
+                    "benchmark",
+                    "run",
+                    "test_bench",
+                    "-f",
+                    str(data_file),
+                    "-s",
+                    str(storage_path),
+                ],
+            )
+            # A silently failed setup run would otherwise surface as a
+            # confusing failure (or a false pass) in the `show` assertions.
+            assert setup.exit_code == 0, setup.output
+
+    def test_benchmark_show_no_runs(self, tmp_path: Path) -> None:
+        """Showing an unknown benchmark exits 0 with a "no runs" message."""
+        storage_path = tmp_path / "benchmarks"
+        storage_path.mkdir()
+
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "show",
+                "nonexistent_bench",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 0
+        assert "No benchmark runs found" in result.stdout
+
+    def test_benchmark_show_with_runs(self, tmp_path: Path) -> None:
+        """After two recorded runs, ``show`` prints the history heading."""
+        data_file = tmp_path / "data.jsonl"
+        data_file.write_text('{"candidate": "hello world", "reference": "hello world"}')
+        storage_path = tmp_path / "benchmarks"
+        self._record_runs(data_file, storage_path, count=2)
+
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "show",
+                "test_bench",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 0
+        assert "Benchmark History" in result.stdout
+
+    def test_benchmark_show_limit(self, tmp_path: Path) -> None:
+        """``--last 2`` is accepted when three runs have been recorded."""
+        data_file = tmp_path / "data.jsonl"
+        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
+        storage_path = tmp_path / "benchmarks"
+        self._record_runs(data_file, storage_path, count=3)
+
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "show",
+                "test_bench",
+                "--last",
+                "2",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 0
+        # `show` also exits 0 when nothing is stored, so the exit code alone
+        # cannot prove that the recorded runs were actually found.
+        assert "No benchmark runs found" not in result.stdout
+
+class TestBenchmarkCheck:
+    """Exercise ``benchmark check`` through the Typer test runner."""
+
+    @staticmethod
+    def _run_benchmark(data_file: Path, storage_path: Path) -> None:
+        """Record one ``test_bench`` run and assert that it succeeded.
+
+        Args:
+            data_file: Input file passed via ``-f``.
+            storage_path: Storage location passed via ``-s``.
+        """
+        setup = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "run",
+                "test_bench",
+                "-f",
+                str(data_file),
+                "-s",
+                str(storage_path),
+            ],
+        )
+        # Fail here, with the command output, rather than later in the
+        # `check` assertions where the root cause would be hidden.
+        assert setup.exit_code == 0, setup.output
+
+    def test_benchmark_check_no_regression(self, tmp_path: Path) -> None:
+        """Two runs over identical data report no regression and exit 0."""
+        data_file = tmp_path / "data.jsonl"
+        data_file.write_text(
+            '{"candidate": "hello world today", "reference": "hello world today"}'
+        )
+        storage_path = tmp_path / "benchmarks"
+
+        # Same data twice, so the second run cannot be worse than the first.
+        for _ in range(2):
+            self._run_benchmark(data_file, storage_path)
+
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "check",
+                "test_bench",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 0
+        assert "No regression detected" in result.stdout
+
+    def test_benchmark_check_with_regression(self, tmp_path: Path) -> None:
+        """A matching run followed by a mismatching run exits 1 with a report."""
+        storage_path = tmp_path / "benchmarks"
+
+        # Baseline run: candidate equals reference.
+        good_file = tmp_path / "good.jsonl"
+        good_file.write_text(
+            '{"candidate": "hello world today", "reference": "hello world today"}'
+        )
+        self._run_benchmark(good_file, storage_path)
+
+        # Follow-up run: candidate shares no words with the reference.
+        bad_file = tmp_path / "bad.jsonl"
+        bad_file.write_text(
+            '{"candidate": "completely different", "reference": "hello world today"}'
+        )
+        self._run_benchmark(bad_file, storage_path)
+
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "check",
+                "test_bench",
+                "-t",
+                "0.05",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 1
+        assert "Regression detected" in result.stdout
+
+    def test_benchmark_check_custom_tolerance(self, tmp_path: Path) -> None:
+        """``--tolerance 0.10`` is echoed in the output as ``10.00%``."""
+        data_file = tmp_path / "data.jsonl"
+        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
+        storage_path = tmp_path / "benchmarks"
+        self._run_benchmark(data_file, storage_path)
+
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "check",
+                "test_bench",
+                "--tolerance",
+                "0.10",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 0
+        assert "10.00%" in result.stdout
+
+
+class TestBenchmarkHelp:
+    """Smoke-test the ``--help`` output of the benchmark command group."""
+
+    def test_benchmark_help(self) -> None:
+        """Group help exits 0 and mentions ``run``, ``show`` and ``check``."""
+        outcome = runner.invoke(app, ["benchmark", "--help"])
+        assert outcome.exit_code == 0
+        help_text = outcome.stdout
+        for subcommand in ("run", "show", "check"):
+            assert subcommand in help_text
+
+    def test_benchmark_run_help(self) -> None:
+        """``run --help`` exits 0 and lists ``--file`` and ``--metrics``."""
+        outcome = runner.invoke(app, ["benchmark", "run", "--help"])
+        assert outcome.exit_code == 0
+        help_text = outcome.stdout
+        for option in ("--file", "--metrics"):
+            assert option in help_text
+
+    def test_benchmark_show_help(self) -> None:
+        """``show --help`` exits 0 and lists ``--last``."""
+        outcome = runner.invoke(app, ["benchmark", "show", "--help"])
+        assert outcome.exit_code == 0
+        help_text = outcome.stdout
+        assert "--last" in help_text
+
+    def test_benchmark_check_help(self) -> None:
+        """``check --help`` exits 0 and lists ``--tolerance`` and ``--window``."""
+        outcome = runner.invoke(app, ["benchmark", "check", "--help"])
+        assert outcome.exit_code == 0
+        help_text = outcome.stdout
+        for option in ("--tolerance", "--window"):
+            assert option in help_text