diff --git a/tests/test_cli/__init__.py b/tests/test_cli/__init__.py
new file mode 100644
index 0000000..36e3c73
--- /dev/null
+++ b/tests/test_cli/__init__.py
@@ -0,0 +1 @@
+"""CLI test suite."""
diff --git a/tests/test_cli/test_benchmark.py b/tests/test_cli/test_benchmark.py
new file mode 100644
index 0000000..e62f42a
--- /dev/null
+++ b/tests/test_cli/test_benchmark.py
@@ -0,0 +1,331 @@
+"""Tests for CLI benchmark commands."""
+
+from pathlib import Path
+
+from typer.testing import CliRunner
+
+from veritext.cli.main import app
+
+runner = CliRunner()
+
+
+class TestBenchmarkRun:
+    """Exercise `benchmark run` through the Typer CLI runner."""
+
+    def test_benchmark_run_basic(self, tmp_path: Path) -> None:
+        data_file = tmp_path / "data.jsonl"
+        data_file.write_text(
+            '{"candidate": "hello world today", "reference": "hello world today"}\n'
+            '{"candidate": "foo bar baz qux", "reference": "foo bar baz qux"}'
+        )
+        storage_path = tmp_path / "benchmarks"
+
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "run",
+                "test_bench",
+                "-f",
+                str(data_file),
+                "-m",
+                "rouge_l,bleu4",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 0
+        assert "Benchmark 'test_bench' completed" in result.stdout
+        assert "Samples: 2" in result.stdout
+        assert "rouge_l:" in result.stdout
+        assert "bleu4:" in result.stdout
+
+    def test_benchmark_run_file_not_found(self, tmp_path: Path) -> None:
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "run",
+                "test_bench",
+                "-f",
+                "/nonexistent/file.jsonl",
+                "-s",
+                str(tmp_path / "benchmarks"),
+            ],
+        )
+        assert result.exit_code == 1
+        assert "Error" in result.stdout
+
+    def test_benchmark_run_creates_storage(self, tmp_path: Path) -> None:
+        data_file = tmp_path / "data.jsonl"
+        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
+        storage_path = tmp_path / "new_benchmarks"
+
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "run",
+                "test_bench",
+                "-f",
+                str(data_file),
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 0
+        assert storage_path.exists()
+
+
+class TestBenchmarkShow:
+    """Exercise `benchmark show` against empty and populated storage."""
+
+    def test_benchmark_show_no_runs(self, tmp_path: Path) -> None:
+        storage_path = tmp_path / "benchmarks"
+        storage_path.mkdir()
+
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "show",
+                "nonexistent_bench",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 0
+        assert "No benchmark runs found" in result.stdout
+
+    def test_benchmark_show_with_runs(self, tmp_path: Path) -> None:
+        # First create some runs
+        data_file = tmp_path / "data.jsonl"
+        data_file.write_text('{"candidate": "hello world", "reference": "hello world"}')
+        storage_path = tmp_path / "benchmarks"
+
+        # Run benchmark twice
+        for _ in range(2):
+            run_result = runner.invoke(
+                app,
+                [
+                    "benchmark",
+                    "run",
+                    "test_bench",
+                    "-f",
+                    str(data_file),
+                    "-s",
+                    str(storage_path),
+                ],
+            )
+            assert run_result.exit_code == 0, run_result.stdout
+
+        # Show history
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "show",
+                "test_bench",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 0
+        assert "Benchmark History" in result.stdout
+
+    def test_benchmark_show_limit(self, tmp_path: Path) -> None:
+        data_file = tmp_path / "data.jsonl"
+        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
+        storage_path = tmp_path / "benchmarks"
+
+        # Run benchmark 3 times
+        for _ in range(3):
+            run_result = runner.invoke(
+                app,
+                [
+                    "benchmark",
+                    "run",
+                    "test_bench",
+                    "-f",
+                    str(data_file),
+                    "-s",
+                    str(storage_path),
+                ],
+            )
+            assert run_result.exit_code == 0, run_result.stdout
+
+        # Show only last 2
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "show",
+                "test_bench",
+                "--last",
+                "2",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        # NOTE(review): only the exit code is checked; the --last limit is unverified.
+        assert result.exit_code == 0
+
+
+class TestBenchmarkCheck:
+    """Exercise `benchmark check` regression detection and tolerance."""
+
+    def test_benchmark_check_no_regression(self, tmp_path: Path) -> None:
+        data_file = tmp_path / "data.jsonl"
+        data_file.write_text(
+            '{"candidate": "hello world today", "reference": "hello world today"}'
+        )
+        storage_path = tmp_path / "benchmarks"
+
+        # Run benchmark twice with same data (no regression)
+        for _ in range(2):
+            run_result = runner.invoke(
+                app,
+                [
+                    "benchmark",
+                    "run",
+                    "test_bench",
+                    "-f",
+                    str(data_file),
+                    "-s",
+                    str(storage_path),
+                ],
+            )
+            assert run_result.exit_code == 0, run_result.stdout
+
+        # Check for regression
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "check",
+                "test_bench",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 0
+        assert "No regression detected" in result.stdout
+
+    def test_benchmark_check_with_regression(self, tmp_path: Path) -> None:
+        storage_path = tmp_path / "benchmarks"
+
+        # First run with good data
+        good_file = tmp_path / "good.jsonl"
+        good_file.write_text(
+            '{"candidate": "hello world today", "reference": "hello world today"}'
+        )
+        good_run = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "run",
+                "test_bench",
+                "-f",
+                str(good_file),
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert good_run.exit_code == 0, good_run.stdout
+
+        # Second run with bad data (regression)
+        bad_file = tmp_path / "bad.jsonl"
+        bad_file.write_text(
+            '{"candidate": "completely different", "reference": "hello world today"}'
+        )
+        bad_run = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "run",
+                "test_bench",
+                "-f",
+                str(bad_file),
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert bad_run.exit_code == 0, bad_run.stdout
+
+        # Check for regression
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "check",
+                "test_bench",
+                "-t",
+                "0.05",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 1
+        assert "Regression detected" in result.stdout
+
+    def test_benchmark_check_custom_tolerance(self, tmp_path: Path) -> None:
+        data_file = tmp_path / "data.jsonl"
+        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
+        storage_path = tmp_path / "benchmarks"
+
+        run_result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "run",
+                "test_bench",
+                "-f",
+                str(data_file),
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert run_result.exit_code == 0, run_result.stdout
+
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "check",
+                "test_bench",
+                "--tolerance",
+                "0.10",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 0
+        assert "10.00%" in result.stdout
+
+
+class TestBenchmarkHelp:
+    """Check that benchmark help output lists commands and options."""
+
+    def test_benchmark_help(self) -> None:
+        result = runner.invoke(app, ["benchmark", "--help"])
+        assert result.exit_code == 0
+        assert "run" in result.stdout
+        assert "show" in result.stdout
+        assert "check" in result.stdout
+
+    def test_benchmark_run_help(self) -> None:
+        result = runner.invoke(app, ["benchmark", "run", "--help"])
+        assert result.exit_code == 0
+        assert "--file" in result.stdout
+        assert "--metrics" in result.stdout
+
+    def test_benchmark_show_help(self) -> None:
+        result = runner.invoke(app, ["benchmark", "show", "--help"])
+        assert result.exit_code == 0
+        assert "--last" in result.stdout
+
+    def test_benchmark_check_help(self) -> None:
+        result = runner.invoke(app, ["benchmark", "check", "--help"])
+        assert result.exit_code == 0
+        assert "--tolerance" in result.stdout
+        assert "--window" in result.stdout
diff --git a/tests/test_cli/test_formatters.py b/tests/test_cli/test_formatters.py
new file mode 100644
index 0000000..3b50855
--- /dev/null
+++ b/tests/test_cli/test_formatters.py
@@ -0,0 +1,128 @@
+"""Tests for CLI output formatters."""
+
+from datetime import UTC, datetime
+
+from veritext.benchmark.models import BenchmarkRun, RegressionReport
+from veritext.cli.formatters import (
+    format_benchmark_history,
+    format_regression_report,
+    format_validation_json,
+    format_validation_simple,
+    format_validation_table,
+)
+
+
+class TestFormatValidationTable:
+    """Check row counts and title of the validation results table."""
+
+    def test_format_empty_results(self) -> None:
+        table = format_validation_table({})
+        assert table.title == "Validation Results"
+        assert table.row_count == 0
+
+    def test_format_single_metric(self) -> None:
+        results = {"bleu4": 0.8523}
+        table = format_validation_table(results)
+        assert table.row_count == 1
+
+    def test_format_multiple_metrics(self) -> None:
+        results = {"bleu4": 0.85, "rouge_l": 0.92, "jaccard": 0.75}
+        table = format_validation_table(results)
+        assert table.row_count == 3
+
+    def test_format_with_threshold(self) -> None:
+        results = {"bleu4": 0.85, "rouge_l": 0.45}
+        table = format_validation_table(results, threshold=0.5)
+        # Expected columns: Metric, Score, Status (not asserted); one row per metric
+        assert table.row_count == 2
+
+
+class TestFormatValidationJson:
+    """Check JSON rendering of validation results."""
+
+    def test_format_empty_results(self) -> None:
+        result = format_validation_json({})
+        assert result == "{}"
+
+    def test_format_results(self) -> None:
+        results = {"bleu4": 0.85, "rouge_l": 0.92}
+        result = format_validation_json(results)
+        assert '"bleu4": 0.85' in result
+        assert '"rouge_l": 0.92' in result
+
+
+class TestFormatValidationSimple:
+    """Check plain `name: score` rendering of validation results."""
+
+    def test_format_empty_results(self) -> None:
+        result = format_validation_simple({})
+        assert result == ""
+
+    def test_format_results(self) -> None:
+        results = {"bleu4": 0.8523, "rouge_l": 0.9234}
+        result = format_validation_simple(results)
+        assert "bleu4: 0.8523" in result
+        assert "rouge_l: 0.9234" in result
+
+
+class TestFormatBenchmarkHistory:
+    """Check the benchmark history table for zero, one and many runs."""
+
+    def test_format_empty_history(self) -> None:
+        table = format_benchmark_history([])
+        assert table.title == "Benchmark History"
+
+    def test_format_single_run(self) -> None:
+        run = BenchmarkRun(
+            id="test-id",
+            benchmark_name="test",
+            timestamp=datetime(2024, 1, 15, 10, 30, tzinfo=UTC),
+            veritext_version="0.1.0",
+            metrics={"rouge_l": 0.85, "bleu4": 0.72},
+            sample_count=100,
+        )
+        table = format_benchmark_history([run])
+        assert table.row_count == 1
+
+    def test_format_multiple_runs(self) -> None:
+        runs = [
+            BenchmarkRun(
+                id=f"test-id-{i}",
+                benchmark_name="test",
+                timestamp=datetime(2024, 1, i + 1, 10, 30, tzinfo=UTC),
+                veritext_version="0.1.0",
+                metrics={"rouge_l": 0.8 + i * 0.01},
+                sample_count=100,
+            )
+            for i in range(3)
+        ]
+        table = format_benchmark_history(runs)
+        assert table.row_count == 3
+
+
+class TestFormatRegressionReport:
+    """Check panel title and border colour for regression reports."""
+
+    def test_format_no_regression(self) -> None:
+        report = RegressionReport(
+            detected=False,
+            baseline={"rouge_l": 0.85},
+            current={"rouge_l": 0.86},
+            deltas={"rouge_l": 0.01},
+            tolerance=0.05,
+        )
+        panel = format_regression_report(report)
+        assert panel.title == "Regression Check"
+        assert panel.border_style == "green"
+
+    def test_format_with_regression(self) -> None:
+        report = RegressionReport(
+            detected=True,
+            baseline={"rouge_l": 0.85, "bleu4": 0.72},
+            current={"rouge_l": 0.70, "bleu4": 0.70},
+            deltas={"rouge_l": -0.15, "bleu4": -0.02},
+            tolerance=0.05,
+        )
+        panel = format_regression_report(report)
+        assert panel.title == "Regression Check"
+        assert panel.border_style == "red"
diff --git a/tests/test_cli/test_readers.py b/tests/test_cli/test_readers.py
new file mode 100644
index 0000000..48a30de
--- /dev/null
+++ b/tests/test_cli/test_readers.py
@@ -0,0 +1,132 @@
+"""Tests for CLI input readers."""
+
+import json
+from pathlib import Path
+
+import pytest
+
+from veritext.cli.readers import TextPair, read_jsonl, read_paired_jsonl
+
+
+class TestTextPair:
+    """Check that TextPair stores its candidate and reference."""
+
+    def test_create_text_pair(self) -> None:
+        pair = TextPair(candidate="hello", reference="world")
+        assert pair.candidate == "hello"
+        assert pair.reference == "world"
+
+
+class TestReadJsonl:
+    """Check read_jsonl parsing and its error reporting."""
+
+    def test_read_valid_jsonl(self, tmp_path: Path) -> None:
+        data = [
+            {"candidate": "foo", "reference": "bar"},
+            {"candidate": "baz", "reference": "qux"},
+        ]
+        jsonl_file = tmp_path / "data.jsonl"
+        jsonl_file.write_text("\n".join(json.dumps(d) for d in data))
+
+        pairs = read_jsonl(jsonl_file)
+
+        assert len(pairs) == 2
+        assert pairs[0].candidate == "foo"
+        assert pairs[0].reference == "bar"
+        assert pairs[1].candidate == "baz"
+        assert pairs[1].reference == "qux"
+
+    def test_read_empty_file(self, tmp_path: Path) -> None:
+        jsonl_file = tmp_path / "empty.jsonl"
+        jsonl_file.write_text("")
+
+        pairs = read_jsonl(jsonl_file)
+
+        assert pairs == []
+
+    def test_read_file_with_blank_lines(self, tmp_path: Path) -> None:
+        jsonl_file = tmp_path / "data.jsonl"
+        content = '{"candidate": "a", "reference": "b"}\n\n{"candidate": "c", "reference": "d"}\n'
+        jsonl_file.write_text(content)
+
+        pairs = read_jsonl(jsonl_file)
+
+        assert len(pairs) == 2
+
+    def test_read_file_not_found(self, tmp_path: Path) -> None:
+        with pytest.raises(FileNotFoundError):
+            read_jsonl(tmp_path / "nonexistent.jsonl")
+
+    def test_read_invalid_json(self, tmp_path: Path) -> None:
+        jsonl_file = tmp_path / "invalid.jsonl"
+        jsonl_file.write_text("not valid json")
+
+        with pytest.raises(ValueError, match="Invalid JSON on line 1"):
+            read_jsonl(jsonl_file)
+
+    def test_read_missing_candidate_key(self, tmp_path: Path) -> None:
+        jsonl_file = tmp_path / "data.jsonl"
+        jsonl_file.write_text('{"reference": "bar"}')
+
+        with pytest.raises(ValueError, match="Missing 'candidate' key on line 1"):
+            read_jsonl(jsonl_file)
+
+    def test_read_missing_reference_key(self, tmp_path: Path) -> None:
+        jsonl_file = tmp_path / "data.jsonl"
+        jsonl_file.write_text('{"candidate": "foo"}')
+
+        with pytest.raises(ValueError, match="Missing 'reference' key on line 1"):
+            read_jsonl(jsonl_file)
+
+
+class TestReadPairedJsonl:
+    """Check read_paired_jsonl pairing and its error reporting."""
+
+    def test_read_paired_valid(self, tmp_path: Path) -> None:
+        candidates_file = tmp_path / "candidates.jsonl"
+        references_file = tmp_path / "references.jsonl"
+
+        candidates_file.write_text('{"text": "foo"}\n{"text": "bar"}')
+        references_file.write_text('{"text": "baz"}\n{"text": "qux"}')
+
+        pairs = read_paired_jsonl(candidates_file, references_file)
+
+        assert len(pairs) == 2
+        assert pairs[0].candidate == "foo"
+        assert pairs[0].reference == "baz"
+        assert pairs[1].candidate == "bar"
+        assert pairs[1].reference == "qux"
+
+    def test_read_paired_length_mismatch(self, tmp_path: Path) -> None:
+        candidates_file = tmp_path / "candidates.jsonl"
+        references_file = tmp_path / "references.jsonl"
+
+        candidates_file.write_text('{"text": "foo"}\n{"text": "bar"}')
+        references_file.write_text('{"text": "baz"}')
+
+        with pytest.raises(ValueError, match="does not match"):
+            read_paired_jsonl(candidates_file, references_file)
+
+    def test_read_paired_candidates_not_found(self, tmp_path: Path) -> None:
+        references_file = tmp_path / "references.jsonl"
+        references_file.write_text('{"text": "baz"}')
+
+        with pytest.raises(FileNotFoundError, match="Candidates file not found"):
+            read_paired_jsonl(tmp_path / "nonexistent.jsonl", references_file)
+
+    def test_read_paired_references_not_found(self, tmp_path: Path) -> None:
+        candidates_file = tmp_path / "candidates.jsonl"
+        candidates_file.write_text('{"text": "foo"}')
+
+        with pytest.raises(FileNotFoundError, match="References file not found"):
+            read_paired_jsonl(candidates_file, tmp_path / "nonexistent.jsonl")
+
+    def test_read_paired_missing_text_key(self, tmp_path: Path) -> None:
+        candidates_file = tmp_path / "candidates.jsonl"
+        references_file = tmp_path / "references.jsonl"
+
+        candidates_file.write_text('{"value": "foo"}')
+        references_file.write_text('{"text": "baz"}')
+
+        with pytest.raises(ValueError, match="Missing 'text' key in candidates file"):
+            read_paired_jsonl(candidates_file, references_file)
diff --git a/tests/test_cli/test_validate.py b/tests/test_cli/test_validate.py
new file mode 100644
index 0000000..1a9d6e4
--- /dev/null
+++ b/tests/test_cli/test_validate.py
@@ -0,0 +1,220 @@
+"""Tests for CLI validate command."""
+
+import json
+from pathlib import Path
+
+from typer.testing import CliRunner
+
+from veritext.cli.main import app
+
+runner = CliRunner()
+
+
+class TestValidateInline:
+    """Exercise `validate` with inline candidate/reference text."""
+
+    def test_validate_inline_basic(self) -> None:
+        result = runner.invoke(
+            app,
+            [
+                "validate",
+                "The quick brown fox jumps",
+                "-r",
+                "The quick brown fox jumps",
+                "-m",
+                "bleu",
+            ],
+        )
+        assert result.exit_code == 0
+        assert "bleu4" in result.stdout
+
+    def test_validate_inline_with_rouge(self) -> None:
+        result = runner.invoke(
+            app,
+            [
+                "validate",
+                "hello world today",
+                "-r",
+                "hello world here",
+                "-m",
+                "rouge",
+            ],
+        )
+        assert result.exit_code == 0
+        assert "rouge_l" in result.stdout
+
+    def test_validate_inline_with_lexical(self) -> None:
+        result = runner.invoke(
+            app,
+            [
+                "validate",
+                "hello world",
+                "-r",
+                "hello everyone",
+                "-m",
+                "lexical",
+            ],
+        )
+        assert result.exit_code == 0
+        assert "jaccard" in result.stdout
+        assert "token_overlap" in result.stdout
+
+    def test_validate_inline_json_output(self) -> None:
+        result = runner.invoke(
+            app,
+            [
+                "validate",
+                "hello world today",
+                "-r",
+                "hello world today",
+                "-m",
+                "bleu",
+                "-o",
+                "json",
+            ],
+        )
+        assert result.exit_code == 0
+        data = json.loads(result.stdout)
+        assert "bleu4" in data
+
+    def test_validate_inline_simple_output(self) -> None:
+        result = runner.invoke(
+            app,
+            [
+                "validate",
+                "hello world today",
+                "-r",
+                "hello world today",
+                "-m",
+                "rouge",
+                "-o",
+                "simple",
+            ],
+        )
+        assert result.exit_code == 0
+        assert "rouge_l:" in result.stdout
+
+    def test_validate_inline_missing_reference(self) -> None:
+        result = runner.invoke(
+            app,
+            ["validate", "hello world", "-m", "bleu"],
+        )
+        assert result.exit_code == 1
+        assert "Error" in result.stdout
+
+    def test_validate_inline_invalid_metric(self) -> None:
+        result = runner.invoke(
+            app,
+            ["validate", "hello", "-r", "world", "-m", "invalid_metric"],
+        )
+        assert result.exit_code == 1
+        assert "Unknown metrics" in result.stdout
+
+
+class TestValidateFile:
+    """Exercise `validate` with JSONL file input."""
+
+    def test_validate_file_basic(self, tmp_path: Path) -> None:
+        data_file = tmp_path / "data.jsonl"
+        data_file.write_text(
+            '{"candidate": "hello world today", "reference": "hello world today"}\n'
+            '{"candidate": "foo bar baz", "reference": "foo bar baz"}'
+        )
+
+        result = runner.invoke(
+            app,
+            ["validate", "-f", str(data_file), "-m", "bleu"],
+        )
+        assert result.exit_code == 0
+        assert "bleu4" in result.stdout
+        assert "Evaluated 2 text pairs" in result.stdout
+
+    def test_validate_file_not_found(self) -> None:
+        result = runner.invoke(
+            app,
+            ["validate", "-f", "/nonexistent/file.jsonl", "-m", "bleu"],
+        )
+        assert result.exit_code == 1
+        assert "Error" in result.stdout
+
+    def test_validate_paired_files(self, tmp_path: Path) -> None:
+        candidates_file = tmp_path / "candidates.jsonl"
+        references_file = tmp_path / "references.jsonl"
+
+        candidates_file.write_text(
+            '{"text": "hello world today"}\n{"text": "foo bar baz"}'
+        )
+        references_file.write_text(
+            '{"text": "hello world today"}\n{"text": "foo bar baz"}'
+        )
+
+        result = runner.invoke(
+            app,
+            [
+                "validate",
+                "-f",
+                str(candidates_file),
+                "-R",
+                str(references_file),
+                "-m",
+                "bleu",
+            ],
+        )
+        assert result.exit_code == 0
+        assert "Evaluated 2 text pairs" in result.stdout
+
+
+class TestValidateOptions:
+    """Exercise `validate` threshold, output-format and multi-metric options."""
+
+    def test_validate_with_threshold(self) -> None:
+        result = runner.invoke(
+            app,
+            [
+                "validate",
+                "hello world today",
+                "-r",
+                "hello world today",
+                "-m",
+                "bleu",
+                "-t",
+                "0.5",
+            ],
+        )
+        assert result.exit_code == 0
+        # Table output should include Status column
+        assert "Status" in result.stdout or "PASS" in result.stdout
+
+    def test_validate_invalid_output_format(self) -> None:
+        result = runner.invoke(
+            app,
+            [
+                "validate",
+                "hello",
+                "-r",
+                "world",
+                "-m",
+                "bleu",
+                "-o",
+                "invalid",
+            ],
+        )
+        assert result.exit_code == 1
+        assert "Invalid output format" in result.stdout
+
+    def test_validate_multiple_metrics(self) -> None:
+        result = runner.invoke(
+            app,
+            [
+                "validate",
+                "The quick brown fox",
+                "-r",
+                "The quick brown fox",
+                "-m",
+                "bleu,rouge,lexical",
+            ],
+        )
+        assert result.exit_code == 0
+        assert "bleu4" in result.stdout
+        assert "rouge_l" in result.stdout
+        assert "jaccard" in result.stdout