cli tests

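Add a comprehensive test suite for the validate command, the benchmark commands, the input readers, and the output formatters. The CLI commands are exercised through Typer's CliRunner; the readers and formatters are tested by calling them directly.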
2025-05-11 14:13:30 +00:00
parent 5f619a626b
commit 8511594697
5 changed files with 775 additions and 0 deletions
@@ -0,0 +1 @@
 """CLI test suite."""
@@ -0,0 +1,316 @@
 """Tests for CLI benchmark commands."""
 from pathlib import Path
 from typer.testing import CliRunner
 from veritext.cli.main import app
 runner = CliRunner()
 class TestBenchmarkRun:
    """Tests for the ``benchmark run`` CLI command, invoked through ``runner``."""

    def test_benchmark_run_basic(self, tmp_path: Path) -> None:
        """Run two candidate/reference pairs and check the printed summary."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}\n'
            '{"candidate": "foo bar baz qux", "reference": "foo bar baz qux"}',
            encoding="utf-8",
        )
        storage_path = tmp_path / "benchmarks"
        result = runner.invoke(
            app,
            [
                "benchmark",
                "run",
                "test_bench",
                "-f",
                str(data_file),
                "-m",
                "rouge_l,bleu4",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "Benchmark 'test_bench' completed" in result.stdout
        assert "Samples: 2" in result.stdout
        assert "rouge_l:" in result.stdout
        assert "bleu4:" in result.stdout

    def test_benchmark_run_file_not_found(self, tmp_path: Path) -> None:
        """A data file that does not exist makes the command exit with code 1."""
        # Keep the missing path inside tmp_path so the test never depends on
        # what happens to exist at an absolute location on the host machine.
        missing_file = tmp_path / "missing.jsonl"
        result = runner.invoke(
            app,
            [
                "benchmark",
                "run",
                "test_bench",
                "-f",
                str(missing_file),
                "-s",
                str(tmp_path / "benchmarks"),
            ],
        )
        assert result.exit_code == 1
        assert "Error" in result.stdout

    def test_benchmark_run_creates_storage(self, tmp_path: Path) -> None:
        """Running against a storage path that does not exist yet creates it."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text(
            '{"candidate": "hello", "reference": "hello"}', encoding="utf-8"
        )
        storage_path = tmp_path / "new_benchmarks"
        # Pin the precondition: the final assertion is only meaningful if the
        # storage location is absent before the command runs.
        assert not storage_path.exists()
        result = runner.invoke(
            app,
            [
                "benchmark",
                "run",
                "test_bench",
                "-f",
                str(data_file),
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert storage_path.exists()
 class TestBenchmarkShow:
    """Tests for the ``benchmark show`` CLI command, invoked through ``runner``."""

    @staticmethod
    def _record_run(data_file: Path, storage_path: Path) -> None:
        """Record one ``test_bench`` run and fail immediately if the setup breaks.

        Without this check a failing ``benchmark run`` would go unnoticed and
        the ``show`` assertions would be exercising an empty history.
        """
        setup = runner.invoke(
            app,
            [
                "benchmark",
                "run",
                "test_bench",
                "-f",
                str(data_file),
                "-s",
                str(storage_path),
            ],
        )
        assert setup.exit_code == 0, setup.output

    def test_benchmark_show_no_runs(self, tmp_path: Path) -> None:
        """Showing an unknown benchmark in an empty storage dir reports no runs."""
        storage_path = tmp_path / "benchmarks"
        storage_path.mkdir()
        result = runner.invoke(
            app,
            [
                "benchmark",
                "show",
                "nonexistent_bench",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "No benchmark runs found" in result.stdout

    def test_benchmark_show_with_runs(self, tmp_path: Path) -> None:
        """After two recorded runs, ``show`` prints the history heading."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text(
            '{"candidate": "hello world", "reference": "hello world"}',
            encoding="utf-8",
        )
        storage_path = tmp_path / "benchmarks"
        # Run benchmark twice
        for _ in range(2):
            self._record_run(data_file, storage_path)
        # Show history
        result = runner.invoke(
            app,
            [
                "benchmark",
                "show",
                "test_bench",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "Benchmark History" in result.stdout

    def test_benchmark_show_limit(self, tmp_path: Path) -> None:
        """``--last 2`` is accepted and still renders the history after three runs."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text(
            '{"candidate": "hello", "reference": "hello"}', encoding="utf-8"
        )
        storage_path = tmp_path / "benchmarks"
        # Run benchmark 3 times
        for _ in range(3):
            self._record_run(data_file, storage_path)
        # Show only last 2
        result = runner.invoke(
            app,
            [
                "benchmark",
                "show",
                "test_bench",
                "--last",
                "2",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        # Same heading the unlimited listing prints; an exit code alone would
        # also pass if the command silently printed nothing.
        assert "Benchmark History" in result.stdout
 class TestBenchmarkCheck:
    """Tests for the ``benchmark check`` CLI command, invoked through ``runner``."""

    @staticmethod
    def _record_run(data_file: Path, storage_path: Path) -> None:
        """Record one ``test_bench`` run and fail immediately if the setup breaks.

        The ``check`` assertions depend on the recorded history, so a setup
        failure must surface here rather than as a confusing ``check`` result.
        """
        setup = runner.invoke(
            app,
            [
                "benchmark",
                "run",
                "test_bench",
                "-f",
                str(data_file),
                "-s",
                str(storage_path),
            ],
        )
        assert setup.exit_code == 0, setup.output

    def test_benchmark_check_no_regression(self, tmp_path: Path) -> None:
        """Two runs over identical data are reported as regression-free."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}',
            encoding="utf-8",
        )
        storage_path = tmp_path / "benchmarks"
        # Run benchmark twice with same data (no regression)
        for _ in range(2):
            self._record_run(data_file, storage_path)
        # Check for regression
        result = runner.invoke(
            app,
            [
                "benchmark",
                "check",
                "test_bench",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "No regression detected" in result.stdout

    def test_benchmark_check_with_regression(self, tmp_path: Path) -> None:
        """A matching run followed by a mismatching run makes ``check`` exit 1."""
        storage_path = tmp_path / "benchmarks"
        # First run with good data
        good_file = tmp_path / "good.jsonl"
        good_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}',
            encoding="utf-8",
        )
        self._record_run(good_file, storage_path)
        # Second run with bad data (regression)
        bad_file = tmp_path / "bad.jsonl"
        bad_file.write_text(
            '{"candidate": "completely different", "reference": "hello world today"}',
            encoding="utf-8",
        )
        self._record_run(bad_file, storage_path)
        # Check for regression
        result = runner.invoke(
            app,
            [
                "benchmark",
                "check",
                "test_bench",
                "-t",
                "0.05",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 1
        assert "Regression detected" in result.stdout

    def test_benchmark_check_custom_tolerance(self, tmp_path: Path) -> None:
        """``--tolerance 0.10`` is echoed in the output as ``10.00%``."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text(
            '{"candidate": "hello", "reference": "hello"}', encoding="utf-8"
        )
        storage_path = tmp_path / "benchmarks"
        self._record_run(data_file, storage_path)
        result = runner.invoke(
            app,
            [
                "benchmark",
                "check",
                "test_bench",
                "--tolerance",
                "0.10",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "10.00%" in result.stdout
 class TestBenchmarkHelp:
    """Smoke tests for the ``--help`` output of the benchmark command group."""

    def test_benchmark_help(self) -> None:
        """The group help mentions each of the three subcommands."""
        outcome = runner.invoke(app, ["benchmark", "--help"])
        assert outcome.exit_code == 0
        for subcommand in ("run", "show", "check"):
            assert subcommand in outcome.stdout

    def test_benchmark_run_help(self) -> None:
        """``benchmark run --help`` mentions the file and metrics options."""
        outcome = runner.invoke(app, ["benchmark", "run", "--help"])
        assert outcome.exit_code == 0
        for option in ("--file", "--metrics"):
            assert option in outcome.stdout

    def test_benchmark_show_help(self) -> None:
        """``benchmark show --help`` mentions the ``--last`` option."""
        outcome = runner.invoke(app, ["benchmark", "show", "--help"])
        assert outcome.exit_code == 0
        assert "--last" in outcome.stdout

    def test_benchmark_check_help(self) -> None:
        """``benchmark check --help`` mentions the tolerance and window options."""
        outcome = runner.invoke(app, ["benchmark", "check", "--help"])
        assert outcome.exit_code == 0
        for option in ("--tolerance", "--window"):
            assert option in outcome.stdout
@@ -0,0 +1,118 @@
 """Tests for CLI output formatters."""
 from datetime import UTC, datetime
 from veritext.benchmark.models import BenchmarkRun, RegressionReport
 from veritext.cli.formatters import (
    format_benchmark_history,
    format_regression_report,
    format_validation_json,
    format_validation_simple,
    format_validation_table,
 )
 class TestFormatValidationTable:
    """Tests for ``format_validation_table``."""

    def test_format_empty_results(self) -> None:
        """An empty result mapping yields a titled table with no rows."""
        table = format_validation_table({})
        assert table.title == "Validation Results"
        assert table.row_count == 0

    def test_format_single_metric(self) -> None:
        """One metric produces one row."""
        results = {"bleu4": 0.8523}
        table = format_validation_table(results)
        assert table.row_count == 1

    def test_format_multiple_metrics(self) -> None:
        """Each metric in the mapping produces its own row."""
        results = {"bleu4": 0.85, "rouge_l": 0.92, "jaccard": 0.75}
        table = format_validation_table(results)
        assert table.row_count == 3

    def test_format_with_threshold(self) -> None:
        """Passing a threshold keeps one row per metric and adds a status column."""
        results = {"bleu4": 0.85, "rouge_l": 0.45}
        table = format_validation_table(results, threshold=0.5)
        assert table.row_count == 2
        # Should have 3 columns: Metric, Score, Status.
        # NOTE(review): the expected count comes from the comment above, which
        # previously had no matching assertion; confirm against the formatter.
        assert len(table.columns) == 3
 class TestFormatValidationJson:
    """Tests for ``format_validation_json``."""

    def test_format_empty_results(self) -> None:
        """An empty mapping is rendered as an empty JSON object."""
        assert format_validation_json({}) == "{}"

    def test_format_results(self) -> None:
        """Each metric appears in the rendered text as a key/value fragment."""
        rendered = format_validation_json({"bleu4": 0.85, "rouge_l": 0.92})
        for fragment in ('"bleu4": 0.85', '"rouge_l": 0.92'):
            assert fragment in rendered
 class TestFormatValidationSimple:
    """Tests for ``format_validation_simple``."""

    def test_format_empty_results(self) -> None:
        """An empty mapping is rendered as an empty string."""
        assert format_validation_simple({}) == ""

    def test_format_results(self) -> None:
        """Each metric appears in the rendered text as ``name: score``."""
        rendered = format_validation_simple({"bleu4": 0.8523, "rouge_l": 0.9234})
        for fragment in ("bleu4: 0.8523", "rouge_l: 0.9234"):
            assert fragment in rendered
 class TestFormatBenchmarkHistory:
    """Tests for ``format_benchmark_history``."""

    def test_format_empty_history(self) -> None:
        """An empty run list still yields a table with the history title."""
        rendered = format_benchmark_history([])
        assert rendered.title == "Benchmark History"

    def test_format_single_run(self) -> None:
        """A single run produces a single row."""
        recorded_at = datetime(2024, 1, 15, 10, 30, tzinfo=UTC)
        only_run = BenchmarkRun(
            id="test-id",
            benchmark_name="test",
            timestamp=recorded_at,
            veritext_version="0.1.0",
            metrics={"rouge_l": 0.85, "bleu4": 0.72},
            sample_count=100,
        )
        rendered = format_benchmark_history([only_run])
        assert rendered.row_count == 1

    def test_format_multiple_runs(self) -> None:
        """Three runs on consecutive days produce three rows."""
        history = []
        for offset in range(3):
            history.append(
                BenchmarkRun(
                    id=f"test-id-{offset}",
                    benchmark_name="test",
                    timestamp=datetime(2024, 1, offset + 1, 10, 30, tzinfo=UTC),
                    veritext_version="0.1.0",
                    metrics={"rouge_l": 0.8 + offset * 0.01},
                    sample_count=100,
                )
            )
        rendered = format_benchmark_history(history)
        assert rendered.row_count == 3
 class TestFormatRegressionReport:
    """Tests for ``format_regression_report``."""

    def test_format_no_regression(self) -> None:
        """A report with ``detected=False`` is rendered with a green border."""
        clean_report = RegressionReport(
            detected=False,
            baseline={"rouge_l": 0.85},
            current={"rouge_l": 0.86},
            deltas={"rouge_l": 0.01},
            tolerance=0.05,
        )
        rendered = format_regression_report(clean_report)
        assert rendered.title == "Regression Check"
        assert rendered.border_style == "green"

    def test_format_with_regression(self) -> None:
        """A report with ``detected=True`` is rendered with a red border."""
        regressed_report = RegressionReport(
            detected=True,
            baseline={"rouge_l": 0.85, "bleu4": 0.72},
            current={"rouge_l": 0.70, "bleu4": 0.70},
            deltas={"rouge_l": -0.15, "bleu4": -0.02},
            tolerance=0.05,
        )
        rendered = format_regression_report(regressed_report)
        assert rendered.title == "Regression Check"
        assert rendered.border_style == "red"
@@ -0,0 +1,126 @@
 """Tests for CLI input readers."""
 import json
 from pathlib import Path
 import pytest
 from veritext.cli.readers import TextPair, read_jsonl, read_paired_jsonl
 class TestTextPair:
    """Tests for the ``TextPair`` container."""

    def test_create_text_pair(self) -> None:
        """Keyword arguments are exposed as same-named attributes."""
        created = TextPair(candidate="hello", reference="world")
        assert created.candidate == "hello"
        assert created.reference == "world"
 class TestReadJsonl:
    """Tests for ``read_jsonl``."""

    def test_read_valid_jsonl(self, tmp_path: Path) -> None:
        """Two well-formed lines are returned as two pairs, in file order."""
        records = [
            {"candidate": "foo", "reference": "bar"},
            {"candidate": "baz", "reference": "qux"},
        ]
        source = tmp_path / "data.jsonl"
        source.write_text("\n".join(map(json.dumps, records)))
        loaded = read_jsonl(source)
        assert len(loaded) == 2
        first, second = loaded[0], loaded[1]
        assert first.candidate == "foo"
        assert first.reference == "bar"
        assert second.candidate == "baz"
        assert second.reference == "qux"

    def test_read_empty_file(self, tmp_path: Path) -> None:
        """An empty file yields an empty list."""
        source = tmp_path / "empty.jsonl"
        source.write_text("")
        assert read_jsonl(source) == []

    def test_read_file_with_blank_lines(self, tmp_path: Path) -> None:
        """A blank line between records does not add to the pair count."""
        source = tmp_path / "data.jsonl"
        source.write_text(
            '{"candidate": "a", "reference": "b"}\n\n{"candidate": "c", "reference": "d"}\n'
        )
        loaded = read_jsonl(source)
        assert len(loaded) == 2

    def test_read_file_not_found(self, tmp_path: Path) -> None:
        """A missing path raises ``FileNotFoundError``."""
        absent = tmp_path / "nonexistent.jsonl"
        with pytest.raises(FileNotFoundError):
            read_jsonl(absent)

    def test_read_invalid_json(self, tmp_path: Path) -> None:
        """Unparseable content raises ``ValueError`` naming line 1."""
        source = tmp_path / "invalid.jsonl"
        source.write_text("not valid json")
        with pytest.raises(ValueError, match="Invalid JSON on line 1"):
            read_jsonl(source)

    def test_read_missing_candidate_key(self, tmp_path: Path) -> None:
        """A record without ``candidate`` raises ``ValueError`` naming the key."""
        source = tmp_path / "data.jsonl"
        source.write_text('{"reference": "bar"}')
        with pytest.raises(ValueError, match="Missing 'candidate' key on line 1"):
            read_jsonl(source)

    def test_read_missing_reference_key(self, tmp_path: Path) -> None:
        """A record without ``reference`` raises ``ValueError`` naming the key."""
        source = tmp_path / "data.jsonl"
        source.write_text('{"candidate": "foo"}')
        with pytest.raises(ValueError, match="Missing 'reference' key on line 1"):
            read_jsonl(source)
 class TestReadPairedJsonl:
    """Tests for ``read_paired_jsonl``."""

    def test_read_paired_valid(self, tmp_path: Path) -> None:
        """Lines from the two files are paired up by position."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text('{"text": "foo"}\n{"text": "bar"}')
        ref_path.write_text('{"text": "baz"}\n{"text": "qux"}')
        loaded = read_paired_jsonl(cand_path, ref_path)
        assert len(loaded) == 2
        first, second = loaded[0], loaded[1]
        assert first.candidate == "foo"
        assert first.reference == "baz"
        assert second.candidate == "bar"
        assert second.reference == "qux"

    def test_read_paired_length_mismatch(self, tmp_path: Path) -> None:
        """Files with different line counts raise ``ValueError``."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text('{"text": "foo"}\n{"text": "bar"}')
        ref_path.write_text('{"text": "baz"}')
        with pytest.raises(ValueError, match="does not match"):
            read_paired_jsonl(cand_path, ref_path)

    def test_read_paired_candidates_not_found(self, tmp_path: Path) -> None:
        """A missing candidates file raises ``FileNotFoundError`` naming it."""
        ref_path = tmp_path / "references.jsonl"
        ref_path.write_text('{"text": "baz"}')
        absent = tmp_path / "nonexistent.jsonl"
        with pytest.raises(FileNotFoundError, match="Candidates file not found"):
            read_paired_jsonl(absent, ref_path)

    def test_read_paired_references_not_found(self, tmp_path: Path) -> None:
        """A missing references file raises ``FileNotFoundError`` naming it."""
        cand_path = tmp_path / "candidates.jsonl"
        cand_path.write_text('{"text": "foo"}')
        absent = tmp_path / "nonexistent.jsonl"
        with pytest.raises(FileNotFoundError, match="References file not found"):
            read_paired_jsonl(cand_path, absent)

    def test_read_paired_missing_text_key(self, tmp_path: Path) -> None:
        """A candidates record without ``text`` raises ``ValueError``."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text('{"value": "foo"}')
        ref_path.write_text('{"text": "baz"}')
        with pytest.raises(ValueError, match="Missing 'text' key in candidates file"):
            read_paired_jsonl(cand_path, ref_path)
@@ -0,0 +1,214 @@
 """Tests for CLI validate command."""
 import json
 from pathlib import Path
 from typer.testing import CliRunner
 from veritext.cli.main import app
 runner = CliRunner()
 class TestValidateInline:
    """Tests for ``validate`` with the candidate text passed on the command line."""

    def test_validate_inline_basic(self) -> None:
        """Requesting ``bleu`` on identical texts prints a ``bleu4`` entry."""
        argv = [
            "validate",
            "The quick brown fox jumps",
            "-r",
            "The quick brown fox jumps",
            "-m",
            "bleu",
        ]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        assert "bleu4" in outcome.stdout

    def test_validate_inline_with_rouge(self) -> None:
        """Requesting ``rouge`` prints a ``rouge_l`` entry."""
        argv = ["validate", "hello world today", "-r", "hello world here", "-m", "rouge"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        assert "rouge_l" in outcome.stdout

    def test_validate_inline_with_lexical(self) -> None:
        """Requesting ``lexical`` prints both ``jaccard`` and ``token_overlap``."""
        argv = ["validate", "hello world", "-r", "hello everyone", "-m", "lexical"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        for metric_name in ("jaccard", "token_overlap"):
            assert metric_name in outcome.stdout

    def test_validate_inline_json_output(self) -> None:
        """With ``-o json`` the whole stdout parses as JSON containing ``bleu4``."""
        argv = [
            "validate",
            "hello world today",
            "-r",
            "hello world today",
            "-m",
            "bleu",
            "-o",
            "json",
        ]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        parsed = json.loads(outcome.stdout)
        assert "bleu4" in parsed

    def test_validate_inline_simple_output(self) -> None:
        """With ``-o simple`` the output contains a ``rouge_l:`` entry."""
        argv = [
            "validate",
            "hello world today",
            "-r",
            "hello world today",
            "-m",
            "rouge",
            "-o",
            "simple",
        ]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        assert "rouge_l:" in outcome.stdout

    def test_validate_inline_missing_reference(self) -> None:
        """Omitting the reference exits with code 1 and prints an error."""
        argv = ["validate", "hello world", "-m", "bleu"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_validate_inline_invalid_metric(self) -> None:
        """An unrecognised metric name exits with code 1 and is reported."""
        argv = ["validate", "hello", "-r", "world", "-m", "invalid_metric"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 1
        assert "Unknown metrics" in outcome.stdout
 class TestValidateFile:
    """Tests for ``validate`` reading its text pairs from JSONL files."""

    def test_validate_file_basic(self, tmp_path: Path) -> None:
        """A combined two-record file is evaluated and the pair count reported."""
        source = tmp_path / "data.jsonl"
        source.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}\n'
            '{"candidate": "foo bar baz", "reference": "foo bar baz"}'
        )
        argv = ["validate", "-f", str(source), "-m", "bleu"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        assert "bleu4" in outcome.stdout
        assert "Evaluated 2 text pairs" in outcome.stdout

    def test_validate_file_not_found(self) -> None:
        """A data file that does not exist exits with code 1 and prints an error."""
        argv = ["validate", "-f", "/nonexistent/file.jsonl", "-m", "bleu"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_validate_paired_files(self, tmp_path: Path) -> None:
        """Separate candidate and reference files (``-f`` / ``-R``) are evaluated together."""
        paired_content = '{"text": "hello world today"}\n{"text": "foo bar baz"}'
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text(paired_content)
        ref_path.write_text(paired_content)
        argv = [
            "validate",
            "-f",
            str(cand_path),
            "-R",
            str(ref_path),
            "-m",
            "bleu",
        ]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        assert "Evaluated 2 text pairs" in outcome.stdout
 class TestValidateOptions:
    """Tests for the optional flags of the ``validate`` command."""

    def test_validate_with_threshold(self) -> None:
        """With ``-t 0.5`` the output mentions either ``Status`` or ``PASS``."""
        argv = [
            "validate",
            "hello world today",
            "-r",
            "hello world today",
            "-m",
            "bleu",
            "-t",
            "0.5",
        ]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        # Table output should include Status column
        assert any(marker in outcome.stdout for marker in ("Status", "PASS"))

    def test_validate_invalid_output_format(self) -> None:
        """An unsupported ``-o`` value exits with code 1 and is reported."""
        argv = [
            "validate",
            "hello",
            "-r",
            "world",
            "-m",
            "bleu",
            "-o",
            "invalid",
        ]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 1
        assert "Invalid output format" in outcome.stdout

    def test_validate_multiple_metrics(self) -> None:
        """A comma-separated metric list prints an entry for each family."""
        argv = [
            "validate",
            "The quick brown fox",
            "-r",
            "The quick brown fox",
            "-m",
            "bleu,rouge,lexical",
        ]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        for metric_name in ("bleu4", "rouge_l", "jaccard"):
            assert metric_name in outcome.stdout