# veritext/tests/test_cli/test_benchmark.py

"""Tests for CLI benchmark commands."""

from pathlib import Path

from typer.testing import CliRunner

from veritext.cli.main import app

# Shared CLI test runner; every test in this module invokes ``app`` through it.
runner = CliRunner()


class TestBenchmarkRun:
    """Tests for the ``benchmark run`` command."""

    def test_benchmark_run_basic(self, tmp_path: Path) -> None:
        """A two-sample run reports completion, the sample count, and each metric."""
        samples = [
            '{"candidate": "hello world today", "reference": "hello world today"}',
            '{"candidate": "foo bar baz qux", "reference": "foo bar baz qux"}',
        ]
        dataset = tmp_path / "data.jsonl"
        # Lines are joined without a trailing newline, matching the JSONL payload
        # this test has always written.
        dataset.write_text("\n".join(samples))
        store = tmp_path / "benchmarks"

        cli_args = ["benchmark", "run", "test_bench"]
        cli_args += ["-f", str(dataset)]
        cli_args += ["-m", "rouge_l,bleu4"]
        cli_args += ["-s", str(store)]
        outcome = runner.invoke(app, cli_args)

        assert outcome.exit_code == 0
        expected_fragments = (
            "Benchmark 'test_bench' completed",
            "Samples: 2",
            "rouge_l:",
            "bleu4:",
        )
        for fragment in expected_fragments:
            assert fragment in outcome.stdout

    def test_benchmark_run_file_not_found(self, tmp_path: Path) -> None:
        """Pointing ``-f`` at a missing file exits with code 1 and prints an error."""
        store = tmp_path / "benchmarks"
        cli_args = [
            "benchmark",
            "run",
            "test_bench",
            "-f",
            "/nonexistent/file.jsonl",
            "-s",
            str(store),
        ]

        outcome = runner.invoke(app, cli_args)

        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_benchmark_run_creates_storage(self, tmp_path: Path) -> None:
        """A successful run leaves the (previously absent) storage path in place."""
        dataset = tmp_path / "data.jsonl"
        dataset.write_text('{"candidate": "hello", "reference": "hello"}')
        # Deliberately not created up front: the command is expected to do it.
        store = tmp_path / "new_benchmarks"

        cli_args = ["benchmark", "run", "test_bench"]
        cli_args += ["-f", str(dataset)]
        cli_args += ["-s", str(store)]
        outcome = runner.invoke(app, cli_args)

        assert outcome.exit_code == 0
        assert store.exists()


class TestBenchmarkShow:
    """Tests for the ``benchmark show`` command."""

    @staticmethod
    def _record_runs(data_file: Path, storage_path: Path, count: int) -> None:
        """Run ``benchmark run test_bench`` ``count`` times as test setup.

        Each invocation is asserted to succeed so that a broken ``run`` command
        fails the test at the setup step instead of letting ``show`` be
        exercised against an empty or partial history.
        """
        for attempt in range(1, count + 1):
            setup = runner.invoke(
                app,
                [
                    "benchmark",
                    "run",
                    "test_bench",
                    "-f",
                    str(data_file),
                    "-s",
                    str(storage_path),
                ],
            )
            assert setup.exit_code == 0, (
                f"setup run {attempt}/{count} failed: {setup.output}"
            )

    def test_benchmark_show_no_runs(self, tmp_path: Path) -> None:
        """Showing an unknown benchmark succeeds and reports that no runs exist."""
        storage_path = tmp_path / "benchmarks"
        storage_path.mkdir()

        result = runner.invoke(
            app,
            [
                "benchmark",
                "show",
                "nonexistent_bench",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "No benchmark runs found" in result.stdout

    def test_benchmark_show_with_runs(self, tmp_path: Path) -> None:
        """After two recorded runs, ``show`` prints the benchmark history."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello world", "reference": "hello world"}')
        storage_path = tmp_path / "benchmarks"

        # Populate the history with two runs before querying it.
        self._record_runs(data_file, storage_path, count=2)

        result = runner.invoke(
            app,
            [
                "benchmark",
                "show",
                "test_bench",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "Benchmark History" in result.stdout

    def test_benchmark_show_limit(self, tmp_path: Path) -> None:
        """``show --last 2`` succeeds when three runs have been recorded."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
        storage_path = tmp_path / "benchmarks"

        # Record more runs than the limit requested below.
        self._record_runs(data_file, storage_path, count=3)

        result = runner.invoke(
            app,
            [
                "benchmark",
                "show",
                "test_bench",
                "--last",
                "2",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0


class TestBenchmarkCheck:
    """Tests for the ``benchmark check`` command."""

    @staticmethod
    def _record_run(data_file: Path, storage_path: Path) -> None:
        """Run ``benchmark run test_bench`` once as test setup.

        The invocation is asserted to succeed so that a broken ``run`` command
        fails the test at the setup step rather than letting ``check`` be
        evaluated against a missing or incomplete history.
        """
        setup = runner.invoke(
            app,
            [
                "benchmark",
                "run",
                "test_bench",
                "-f",
                str(data_file),
                "-s",
                str(storage_path),
            ],
        )
        assert setup.exit_code == 0, f"setup run failed: {setup.output}"

    def test_benchmark_check_no_regression(self, tmp_path: Path) -> None:
        """Two runs over identical data yield a passing regression check."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}'
        )
        storage_path = tmp_path / "benchmarks"

        # Same data twice, so the second run cannot score worse than the first.
        for _ in range(2):
            self._record_run(data_file, storage_path)

        result = runner.invoke(
            app,
            [
                "benchmark",
                "check",
                "test_bench",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "No regression detected" in result.stdout

    def test_benchmark_check_with_regression(self, tmp_path: Path) -> None:
        """A matching run followed by a mismatching run makes ``check`` exit 1."""
        storage_path = tmp_path / "benchmarks"

        # Baseline: candidate equals reference.
        good_file = tmp_path / "good.jsonl"
        good_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}'
        )
        self._record_run(good_file, storage_path)

        # Follow-up: candidate shares no words with the reference.
        bad_file = tmp_path / "bad.jsonl"
        bad_file.write_text(
            '{"candidate": "completely different", "reference": "hello world today"}'
        )
        self._record_run(bad_file, storage_path)

        result = runner.invoke(
            app,
            [
                "benchmark",
                "check",
                "test_bench",
                "-t",
                "0.05",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 1
        assert "Regression detected" in result.stdout

    def test_benchmark_check_custom_tolerance(self, tmp_path: Path) -> None:
        """``--tolerance 0.10`` is echoed in the output as ``10.00%``."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
        storage_path = tmp_path / "benchmarks"

        self._record_run(data_file, storage_path)

        result = runner.invoke(
            app,
            [
                "benchmark",
                "check",
                "test_bench",
                "--tolerance",
                "0.10",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "10.00%" in result.stdout


class TestBenchmarkHelp:
    """Tests for the ``--help`` output of the benchmark command group."""

    def test_benchmark_help(self) -> None:
        """The group help lists the ``run``, ``show`` and ``check`` subcommands."""
        outcome = runner.invoke(app, ["benchmark", "--help"])
        assert outcome.exit_code == 0
        for subcommand in ("run", "show", "check"):
            assert subcommand in outcome.stdout

    def test_benchmark_run_help(self) -> None:
        """``run --help`` mentions the ``--file`` and ``--metrics`` options."""
        outcome = runner.invoke(app, ["benchmark", "run", "--help"])
        assert outcome.exit_code == 0
        for option in ("--file", "--metrics"):
            assert option in outcome.stdout

    def test_benchmark_show_help(self) -> None:
        """``show --help`` mentions the ``--last`` option."""
        help_args = ["benchmark", "show", "--help"]
        outcome = runner.invoke(app, help_args)
        assert outcome.exit_code == 0
        assert "--last" in outcome.stdout

    def test_benchmark_check_help(self) -> None:
        """``check --help`` mentions the ``--tolerance`` and ``--window`` options."""
        outcome = runner.invoke(app, ["benchmark", "check", "--help"])
        assert outcome.exit_code == 0
        for option in ("--tolerance", "--window"):
            assert option in outcome.stdout