diff --git a/tests/test_cli/__init__.py b/tests/test_cli/__init__.py
new file mode 100644
index 0000000..36e3c73
--- /dev/null
+++ b/tests/test_cli/__init__.py
@@ -0,0 +1 @@
+"""CLI test suite."""
diff --git a/tests/test_cli/test_benchmark.py b/tests/test_cli/test_benchmark.py
new file mode 100644
index 0000000..e62f42a
--- /dev/null
+++ b/tests/test_cli/test_benchmark.py
@@ -0,0 +1,316 @@
+"""Tests for CLI benchmark commands."""
+
+from pathlib import Path
+
+from typer.testing import CliRunner
+
+from veritext.cli.main import app
+
+runner = CliRunner()  # module-wide runner used to invoke ``app`` in the tests below
+
+
+class TestBenchmarkRun:
+    def test_benchmark_run_basic(self, tmp_path: Path) -> None:
+        data_file = tmp_path / "data.jsonl"
+        data_file.write_text(
+            '{"candidate": "hello world today", "reference": "hello world today"}\n'
+            '{"candidate": "foo bar baz qux", "reference": "foo bar baz qux"}'
+        )
+        storage_path = tmp_path / "benchmarks"
+
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "run",
+                "test_bench",
+                "-f",
+                str(data_file),
+                "-m",
+                "rouge_l,bleu4",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 0
+        assert "Benchmark 'test_bench' completed" in result.stdout
+        assert "Samples: 2" in result.stdout
+        assert "rouge_l:" in result.stdout
+        assert "bleu4:" in result.stdout
+
+    def test_benchmark_run_file_not_found(self, tmp_path: Path) -> None:
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "run",
+                "test_bench",
+                "-f",
+                "/nonexistent/file.jsonl",
+                "-s",
+                str(tmp_path / "benchmarks"),
+            ],
+        )
+        assert result.exit_code == 1
+        assert "Error" in result.stdout
+
+    def test_benchmark_run_creates_storage(self, tmp_path: Path) -> None:
+        data_file = tmp_path / "data.jsonl"
+        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
+        storage_path = tmp_path / "new_benchmarks"
+
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "run",
+                "test_bench",
+                "-f",
+                str(data_file),
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 0
+        assert storage_path.exists()
+
+
+class TestBenchmarkShow:
+    def test_benchmark_show_no_runs(self, tmp_path: Path) -> None:
+        storage_path = tmp_path / "benchmarks"
+        storage_path.mkdir()
+
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "show",
+                "nonexistent_bench",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 0
+        assert "No benchmark runs found" in result.stdout
+
+    def test_benchmark_show_with_runs(self, tmp_path: Path) -> None:
+        # First create some runs
+        data_file = tmp_path / "data.jsonl"
+        data_file.write_text('{"candidate": "hello world", "reference": "hello world"}')
+        storage_path = tmp_path / "benchmarks"
+
+        # Run benchmark twice
+        for _ in range(2):
+            runner.invoke(
+                app,
+                [
+                    "benchmark",
+                    "run",
+                    "test_bench",
+                    "-f",
+                    str(data_file),
+                    "-s",
+                    str(storage_path),
+                ],
+            )
+
+        # Show history
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "show",
+                "test_bench",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 0
+        assert "Benchmark History" in result.stdout
+
+    def test_benchmark_show_limit(self, tmp_path: Path) -> None:
+        data_file = tmp_path / "data.jsonl"
+        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
+        storage_path = tmp_path / "benchmarks"
+
+        # Run benchmark 3 times
+        for _ in range(3):
+            runner.invoke(
+                app,
+                [
+                    "benchmark",
+                    "run",
+                    "test_bench",
+                    "-f",
+                    str(data_file),
+                    "-s",
+                    str(storage_path),
+                ],
+            )
+
+        # Show only last 2
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "show",
+                "test_bench",
+                "--last",
+                "2",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 0
+
+
+class TestBenchmarkCheck:
+    def test_benchmark_check_no_regression(self, tmp_path: Path) -> None:
+        data_file = tmp_path / "data.jsonl"
+        data_file.write_text(
+            '{"candidate": "hello world today", "reference": "hello world today"}'
+        )
+        storage_path = tmp_path / "benchmarks"
+
+        # Run benchmark twice with same data (no regression)
+        for _ in range(2):
+            runner.invoke(
+                app,
+                [
+                    "benchmark",
+                    "run",
+                    "test_bench",
+                    "-f",
+                    str(data_file),
+                    "-s",
+                    str(storage_path),
+                ],
+            )
+
+        # Check for regression
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "check",
+                "test_bench",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 0
+        assert "No regression detected" in result.stdout
+
+    def test_benchmark_check_with_regression(self, tmp_path: Path) -> None:
+        storage_path = tmp_path / "benchmarks"
+
+        # First run with good data
+        good_file = tmp_path / "good.jsonl"
+        good_file.write_text(
+            '{"candidate": "hello world today", "reference": "hello world today"}'
+        )
+        runner.invoke(
+            app,
+            [
+                "benchmark",
+                "run",
+                "test_bench",
+                "-f",
+                str(good_file),
+                "-s",
+                str(storage_path),
+            ],
+        )
+
+        # Second run with bad data (regression)
+        bad_file = tmp_path / "bad.jsonl"
+        bad_file.write_text(
+            '{"candidate": "completely different", "reference": "hello world today"}'
+        )
+        runner.invoke(
+            app,
+            [
+                "benchmark",
+                "run",
+                "test_bench",
+                "-f",
+                str(bad_file),
+                "-s",
+                str(storage_path),
+            ],
+        )
+
+        # Check for regression
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "check",
+                "test_bench",
+                "-t",
+                "0.05",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 1
+        assert "Regression detected" in result.stdout
+
+    def test_benchmark_check_custom_tolerance(self, tmp_path: Path) -> None:
+        data_file = tmp_path / "data.jsonl"
+        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
+        storage_path = tmp_path / "benchmarks"
+
+        runner.invoke(
+            app,
+            [
+                "benchmark",
+                "run",
+                "test_bench",
+                "-f",
+                str(data_file),
+                "-s",
+                str(storage_path),
+            ],
+        )
+
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                "check",
+                "test_bench",
+                "--tolerance",
+                "0.10",
+                "-s",
+                str(storage_path),
+            ],
+        )
+        assert result.exit_code == 0
+        assert "10.00%" in result.stdout
+
+
+class TestBenchmarkHelp:
+    def test_benchmark_help(self) -> None:
+        result = runner.invoke(app, ["benchmark", "--help"])
+        assert result.exit_code == 0
+        assert "run" in result.stdout
+        assert "show" in result.stdout
+        assert "check" in result.stdout
+
+    def test_benchmark_run_help(self) -> None:
+        result = runner.invoke(app, ["benchmark", "run", "--help"])
+        assert result.exit_code == 0
+        assert "--file" in result.stdout
+        assert "--metrics" in result.stdout
+
+    def test_benchmark_show_help(self) -> None:
+        result = runner.invoke(app, ["benchmark", "show", "--help"])
+        assert result.exit_code == 0
+        assert "--last" in result.stdout
+
+    def test_benchmark_check_help(self) -> None:
+        result = runner.invoke(app, ["benchmark", "check", "--help"])
+        assert result.exit_code == 0
+        assert "--tolerance" in result.stdout
+        assert "--window" in result.stdout
diff --git a/tests/test_cli/test_formatters.py b/tests/test_cli/test_formatters.py
new file mode 100644
index 0000000..3b50855
--- /dev/null
+++ b/tests/test_cli/test_formatters.py
@@ -0,0 +1,118 @@
+"""Tests for CLI output formatters."""
+
+from datetime import UTC, datetime
+
+from veritext.benchmark.models import BenchmarkRun, RegressionReport
+from veritext.cli.formatters import (
+    format_benchmark_history,
+    format_regression_report,
+    format_validation_json,
+    format_validation_simple,
+    format_validation_table,
+)
+
+
+class TestFormatValidationTable:
+    def test_format_empty_results(self) -> None:
+        table = format_validation_table({})
+        assert table.title == "Validation Results"
+        assert table.row_count == 0
+
+    def test_format_single_metric(self) -> None:
+        results = {"bleu4": 0.8523}
+        table = format_validation_table(results)
+        assert table.row_count == 1
+
+    def test_format_multiple_metrics(self) -> None:
+        results = {"bleu4": 0.85, "rouge_l": 0.92, "jaccard": 0.75}
+        table = format_validation_table(results)
+        assert table.row_count == 3
+
+    def test_format_with_threshold(self) -> None:
+        results = {"bleu4": 0.85, "rouge_l": 0.45}
+        table = format_validation_table(results, threshold=0.5)
+        # Should have 3 columns: Metric, Score, Status
+        assert table.row_count == 2
+
+
+class TestFormatValidationJson:
+    def test_format_empty_results(self) -> None:
+        result = format_validation_json({})
+        assert result == "{}"
+
+    def test_format_results(self) -> None:
+        results = {"bleu4": 0.85, "rouge_l": 0.92}
+        result = format_validation_json(results)
+        assert '"bleu4": 0.85' in result
+        assert '"rouge_l": 0.92' in result
+
+
+class TestFormatValidationSimple:
+    def test_format_empty_results(self) -> None:
+        result = format_validation_simple({})
+        assert result == ""
+
+    def test_format_results(self) -> None:
+        results = {"bleu4": 0.8523, "rouge_l": 0.9234}
+        result = format_validation_simple(results)
+        assert "bleu4: 0.8523" in result
+        assert "rouge_l: 0.9234" in result
+
+
+class TestFormatBenchmarkHistory:
+    def test_format_empty_history(self) -> None:
+        table = format_benchmark_history([])
+        assert table.title == "Benchmark History"
+
+    def test_format_single_run(self) -> None:
+        run = BenchmarkRun(
+            id="test-id",
+            benchmark_name="test",
+            timestamp=datetime(2024, 1, 15, 10, 30, tzinfo=UTC),
+            veritext_version="0.1.0",
+            metrics={"rouge_l": 0.85, "bleu4": 0.72},
+            sample_count=100,
+        )
+        table = format_benchmark_history([run])
+        assert table.row_count == 1
+
+    def test_format_multiple_runs(self) -> None:
+        runs = [
+            BenchmarkRun(
+                id=f"test-id-{i}",
+                benchmark_name="test",
+                timestamp=datetime(2024, 1, i + 1, 10, 30, tzinfo=UTC),
+                veritext_version="0.1.0",
+                metrics={"rouge_l": 0.8 + i * 0.01},
+                sample_count=100,
+            )
+            for i in range(3)
+        ]
+        table = format_benchmark_history(runs)
+        assert table.row_count == 3
+
+
+class TestFormatRegressionReport:
+    def test_format_no_regression(self) -> None:
+        report = RegressionReport(
+            detected=False,
+            baseline={"rouge_l": 0.85},
+            current={"rouge_l": 0.86},
+            deltas={"rouge_l": 0.01},
+            tolerance=0.05,
+        )
+        panel = format_regression_report(report)
+        assert panel.title == "Regression Check"
+        assert panel.border_style == "green"
+
+    def test_format_with_regression(self) -> None:
+        report = RegressionReport(
+            detected=True,
+            baseline={"rouge_l": 0.85, "bleu4": 0.72},
+            current={"rouge_l": 0.70, "bleu4": 0.70},
+            deltas={"rouge_l": -0.15, "bleu4": -0.02},
+            tolerance=0.05,
+        )
+        panel = format_regression_report(report)
+        assert panel.title == "Regression Check"
+        assert panel.border_style == "red"
diff --git a/tests/test_cli/test_readers.py b/tests/test_cli/test_readers.py
new file mode 100644
index 0000000..48a30de
--- /dev/null
+++ b/tests/test_cli/test_readers.py
@@ -0,0 +1,126 @@
+"""Tests for CLI input readers."""
+
+import json
+from pathlib import Path
+
+import pytest
+
+from veritext.cli.readers import TextPair, read_jsonl, read_paired_jsonl
+
+
+class TestTextPair:
+    def test_create_text_pair(self) -> None:
+        pair = TextPair(candidate="hello", reference="world")
+        assert pair.candidate == "hello"
+        assert pair.reference == "world"
+
+
+class TestReadJsonl:
+    def test_read_valid_jsonl(self, tmp_path: Path) -> None:
+        data = [
+            {"candidate": "foo", "reference": "bar"},
+            {"candidate": "baz", "reference": "qux"},
+        ]
+        jsonl_file = tmp_path / "data.jsonl"
+        jsonl_file.write_text("\n".join(json.dumps(d) for d in data))
+
+        pairs = read_jsonl(jsonl_file)
+
+        assert len(pairs) == 2
+        assert pairs[0].candidate == "foo"
+        assert pairs[0].reference == "bar"
+        assert pairs[1].candidate == "baz"
+        assert pairs[1].reference == "qux"
+
+    def test_read_empty_file(self, tmp_path: Path) -> None:
+        jsonl_file = tmp_path / "empty.jsonl"
+        jsonl_file.write_text("")
+
+        pairs = read_jsonl(jsonl_file)
+
+        assert pairs == []
+
+    def test_read_file_with_blank_lines(self, tmp_path: Path) -> None:
+        jsonl_file = tmp_path / "data.jsonl"
+        content = '{"candidate": "a", "reference": "b"}\n\n{"candidate": "c", "reference": "d"}\n'
+        jsonl_file.write_text(content)
+
+        pairs = read_jsonl(jsonl_file)
+
+        assert len(pairs) == 2
+
+    def test_read_file_not_found(self, tmp_path: Path) -> None:
+        with pytest.raises(FileNotFoundError):
+            read_jsonl(tmp_path / "nonexistent.jsonl")
+
+    def test_read_invalid_json(self, tmp_path: Path) -> None:
+        jsonl_file = tmp_path / "invalid.jsonl"
+        jsonl_file.write_text("not valid json")
+
+        with pytest.raises(ValueError, match="Invalid JSON on line 1"):
+            read_jsonl(jsonl_file)
+
+    def test_read_missing_candidate_key(self, tmp_path: Path) -> None:
+        jsonl_file = tmp_path / "data.jsonl"
+        jsonl_file.write_text('{"reference": "bar"}')
+
+        with pytest.raises(ValueError, match="Missing 'candidate' key on line 1"):
+            read_jsonl(jsonl_file)
+
+    def test_read_missing_reference_key(self, tmp_path: Path) -> None:
+        jsonl_file = tmp_path / "data.jsonl"
+        jsonl_file.write_text('{"candidate": "foo"}')
+
+        with pytest.raises(ValueError, match="Missing 'reference' key on line 1"):
+            read_jsonl(jsonl_file)
+
+
+class TestReadPairedJsonl:
+    def test_read_paired_valid(self, tmp_path: Path) -> None:
+        candidates_file = tmp_path / "candidates.jsonl"
+        references_file = tmp_path / "references.jsonl"
+
+        candidates_file.write_text('{"text": "foo"}\n{"text": "bar"}')
+        references_file.write_text('{"text": "baz"}\n{"text": "qux"}')
+
+        pairs = read_paired_jsonl(candidates_file, references_file)
+
+        assert len(pairs) == 2
+        assert pairs[0].candidate == "foo"
+        assert pairs[0].reference == "baz"
+        assert pairs[1].candidate == "bar"
+        assert pairs[1].reference == "qux"
+
+    def test_read_paired_length_mismatch(self, tmp_path: Path) -> None:
+        candidates_file = tmp_path / "candidates.jsonl"
+        references_file = tmp_path / "references.jsonl"
+
+        candidates_file.write_text('{"text": "foo"}\n{"text": "bar"}')
+        references_file.write_text('{"text": "baz"}')
+
+        with pytest.raises(ValueError, match="does not match"):
+            read_paired_jsonl(candidates_file, references_file)
+
+    def test_read_paired_candidates_not_found(self, tmp_path: Path) -> None:
+        references_file = tmp_path / "references.jsonl"
+        references_file.write_text('{"text": "baz"}')
+
+        with pytest.raises(FileNotFoundError, match="Candidates file not found"):
+            read_paired_jsonl(tmp_path / "nonexistent.jsonl", references_file)
+
+    def test_read_paired_references_not_found(self, tmp_path: Path) -> None:
+        candidates_file = tmp_path / "candidates.jsonl"
+        candidates_file.write_text('{"text": "foo"}')
+
+        with pytest.raises(FileNotFoundError, match="References file not found"):
+            read_paired_jsonl(candidates_file, tmp_path / "nonexistent.jsonl")
+
+    def test_read_paired_missing_text_key(self, tmp_path: Path) -> None:
+        candidates_file = tmp_path / "candidates.jsonl"
+        references_file = tmp_path / "references.jsonl"
+
+        candidates_file.write_text('{"value": "foo"}')
+        references_file.write_text('{"text": "baz"}')
+
+        with pytest.raises(ValueError, match="Missing 'text' key in candidates file"):
+            read_paired_jsonl(candidates_file, references_file)
diff --git a/tests/test_cli/test_validate.py b/tests/test_cli/test_validate.py
new file mode 100644
index 0000000..1a9d6e4
--- /dev/null
+++ b/tests/test_cli/test_validate.py
@@ -0,0 +1,214 @@
+"""Tests for CLI validate command."""
+
+import json
+from pathlib import Path
+
+from typer.testing import CliRunner
+
+from veritext.cli.main import app
+
+runner = CliRunner()  # module-wide runner used to invoke ``app`` in the tests below
+
+
+class TestValidateInline:
+    def test_validate_inline_basic(self) -> None:
+        result = runner.invoke(
+            app,
+            [
+                "validate",
+                "The quick brown fox jumps",
+                "-r",
+                "The quick brown fox jumps",
+                "-m",
+                "bleu",
+            ],
+        )
+        assert result.exit_code == 0
+        assert "bleu4" in result.stdout
+
+    def test_validate_inline_with_rouge(self) -> None:
+        result = runner.invoke(
+            app,
+            [
+                "validate",
+                "hello world today",
+                "-r",
+                "hello world here",
+                "-m",
+                "rouge",
+            ],
+        )
+        assert result.exit_code == 0
+        assert "rouge_l" in result.stdout
+
+    def test_validate_inline_with_lexical(self) -> None:
+        result = runner.invoke(
+            app,
+            [
+                "validate",
+                "hello world",
+                "-r",
+                "hello everyone",
+                "-m",
+                "lexical",
+            ],
+        )
+        assert result.exit_code == 0
+        assert "jaccard" in result.stdout
+        assert "token_overlap" in result.stdout
+
+    def test_validate_inline_json_output(self) -> None:
+        result = runner.invoke(
+            app,
+            [
+                "validate",
+                "hello world today",
+                "-r",
+                "hello world today",
+                "-m",
+                "bleu",
+                "-o",
+                "json",
+            ],
+        )
+        assert result.exit_code == 0
+        data = json.loads(result.stdout)
+        assert "bleu4" in data
+
+    def test_validate_inline_simple_output(self) -> None:
+        result = runner.invoke(
+            app,
+            [
+                "validate",
+                "hello world today",
+                "-r",
+                "hello world today",
+                "-m",
+                "rouge",
+                "-o",
+                "simple",
+            ],
+        )
+        assert result.exit_code == 0
+        assert "rouge_l:" in result.stdout
+
+    def test_validate_inline_missing_reference(self) -> None:
+        result = runner.invoke(
+            app,
+            ["validate", "hello world", "-m", "bleu"],
+        )
+        assert result.exit_code == 1
+        assert "Error" in result.stdout
+
+    def test_validate_inline_invalid_metric(self) -> None:
+        result = runner.invoke(
+            app,
+            ["validate", "hello", "-r", "world", "-m", "invalid_metric"],
+        )
+        assert result.exit_code == 1
+        assert "Unknown metrics" in result.stdout
+
+
+class TestValidateFile:
+    def test_validate_file_basic(self, tmp_path: Path) -> None:
+        data_file = tmp_path / "data.jsonl"
+        data_file.write_text(
+            '{"candidate": "hello world today", "reference": "hello world today"}\n'
+            '{"candidate": "foo bar baz", "reference": "foo bar baz"}'
+        )
+
+        result = runner.invoke(
+            app,
+            ["validate", "-f", str(data_file), "-m", "bleu"],
+        )
+        assert result.exit_code == 0
+        assert "bleu4" in result.stdout
+        assert "Evaluated 2 text pairs" in result.stdout
+
+    def test_validate_file_not_found(self) -> None:
+        result = runner.invoke(
+            app,
+            ["validate", "-f", "/nonexistent/file.jsonl", "-m", "bleu"],
+        )
+        assert result.exit_code == 1
+        assert "Error" in result.stdout
+
+    def test_validate_paired_files(self, tmp_path: Path) -> None:
+        candidates_file = tmp_path / "candidates.jsonl"
+        references_file = tmp_path / "references.jsonl"
+
+        candidates_file.write_text(
+            '{"text": "hello world today"}\n{"text": "foo bar baz"}'
+        )
+        references_file.write_text(
+            '{"text": "hello world today"}\n{"text": "foo bar baz"}'
+        )
+
+        result = runner.invoke(
+            app,
+            [
+                "validate",
+                "-f",
+                str(candidates_file),
+                "-R",
+                str(references_file),
+                "-m",
+                "bleu",
+            ],
+        )
+        assert result.exit_code == 0
+        assert "Evaluated 2 text pairs" in result.stdout
+
+
+class TestValidateOptions:
+    def test_validate_with_threshold(self) -> None:
+        result = runner.invoke(
+            app,
+            [
+                "validate",
+                "hello world today",
+                "-r",
+                "hello world today",
+                "-m",
+                "bleu",
+                "-t",
+                "0.5",
+            ],
+        )
+        assert result.exit_code == 0
+        # Table output should include Status column
+        assert "Status" in result.stdout or "PASS" in result.stdout
+
+    def test_validate_invalid_output_format(self) -> None:
+        result = runner.invoke(
+            app,
+            [
+                "validate",
+                "hello",
+                "-r",
+                "world",
+                "-m",
+                "bleu",
+                "-o",
+                "invalid",
+            ],
+        )
+        assert result.exit_code == 1
+        assert "Invalid output format" in result.stdout
+
+    def test_validate_multiple_metrics(self) -> None:
+        result = runner.invoke(
+            app,
+            [
+                "validate",
+                "The quick brown fox",
+                "-r",
+                "The quick brown fox",
+                "-m",
+                "bleu,rouge,lexical",
+            ],
+        )
+        assert result.exit_code == 0
+        assert "bleu4" in result.stdout
+        assert "rouge_l" in result.stdout
+        assert "jaccard" in result.stdout