tests for tokeniser and types

Cover WordTokeniser (Unicode, empty input, punctuation, multiple scripts) and validation types (immutability, edge cases, failure summary).
2025-03-09 11:42:26 +00:00
parent 1fb9e1f835
commit 2827dcdf4e
10 changed files with 303 additions and 0 deletions
@@ -0,0 +1,103 @@
+"""Tests for the tokenisation module."""
+
+from typing import TYPE_CHECKING
+
+from veritext.core.tokenisation import WordTokeniser
+
+if TYPE_CHECKING:
+    from veritext.core.tokenisation import Tokeniser
+
+
+class TestWordTokeniser:
+    """Behavioural tests for ``WordTokeniser.tokenise``.
+
+    The ``word_tokeniser*`` parameters are pytest fixtures defined outside this
+    module (presumably in ``conftest.py`` -- confirm); the expectations below
+    assume each fixture is configured the way its name suggests.
+    """
+
+    def test_basic_tokenisation(self, word_tokeniser: WordTokeniser) -> None:
+        """A plain sentence becomes lowercased, space-separated tokens."""
+        tokens = word_tokeniser.tokenise("The cat sat on the mat")
+        assert tokens == ["the", "cat", "sat", "on", "the", "mat"]
+
+    def test_lowercasing(self, word_tokeniser: WordTokeniser) -> None:
+        """Mixed-case input is lowercased by the default fixture."""
+        tokens = word_tokeniser.tokenise("Hello WORLD")
+        assert tokens == ["hello", "world"]
+
+    def test_no_lowercasing(self, word_tokeniser_no_lowercase: WordTokeniser) -> None:
+        """The no-lowercase fixture preserves the original casing."""
+        tokens = word_tokeniser_no_lowercase.tokenise("Hello WORLD")
+        assert tokens == ["Hello", "WORLD"]
+
+    def test_punctuation_removal(self, word_tokeniser: WordTokeniser) -> None:
+        """Commas, exclamation and question marks are dropped from tokens."""
+        tokens = word_tokeniser.tokenise("Hello, world! How are you?")
+        assert tokens == ["hello", "world", "how", "are", "you"]
+
+    def test_keep_punctuation(
+        self, word_tokeniser_keep_punctuation: WordTokeniser
+    ) -> None:
+        """The keep-punctuation fixture leaves punctuation attached to words."""
+        tokens = word_tokeniser_keep_punctuation.tokenise("Hello, world!")
+        assert tokens == ["hello,", "world!"]
+
+    def test_empty_string(self, word_tokeniser: WordTokeniser) -> None:
+        """An empty string produces no tokens."""
+        tokens = word_tokeniser.tokenise("")
+        assert tokens == []
+
+    def test_whitespace_only(self, word_tokeniser: WordTokeniser) -> None:
+        """Input made only of spaces, tabs and newlines produces no tokens."""
+        tokens = word_tokeniser.tokenise("   \t\n  ")
+        assert tokens == []
+
+    def test_multiple_spaces(self, word_tokeniser: WordTokeniser) -> None:
+        """Runs of spaces between words do not create empty tokens."""
+        tokens = word_tokeniser.tokenise("hello    world")
+        assert tokens == ["hello", "world"]
+
+    def test_unicode_nfc_normalisation(self) -> None:
+        """Composed and decomposed 'é' normalise to the same token."""
+        # 'é' can be composed (U+00E9) or decomposed (e + U+0301)
+        composed = "caf\u00e9"  # café with composed é
+        decomposed = "cafe\u0301"  # café with decomposed é
+
+        tokeniser = WordTokeniser()
+        tokens_composed = tokeniser.tokenise(composed)
+        tokens_decomposed = tokeniser.tokenise(decomposed)
+
+        # After NFC normalisation, both should be the same
+        assert tokens_composed == tokens_decomposed
+        # Compare against the escaped NFC spelling rather than a raw "café"
+        # literal: the normalisation form of a raw literal depends on how this
+        # file was saved, which would make the expectation ambiguous.
+        assert tokens_composed == [composed]
+
+    def test_unicode_emoji(self, word_tokeniser: WordTokeniser) -> None:
+        """Emoji do not appear in the token list."""
+        # Emoji are removed as punctuation by default
+        tokens = word_tokeniser.tokenise("Hello 👋 world 🌍")
+        assert tokens == ["hello", "world"]
+
+    def test_unicode_non_latin(self, word_tokeniser: WordTokeniser) -> None:
+        """Japanese text is split on the space and otherwise kept intact."""
+        tokens = word_tokeniser.tokenise("日本語 テスト")
+        assert tokens == ["日本語", "テスト"]
+
+    def test_unicode_mixed_scripts(self, word_tokeniser: WordTokeniser) -> None:
+        """Latin, CJK and Cyrillic words in one input are all kept."""
+        tokens = word_tokeniser.tokenise("Hello 世界 Bonjour мир")
+        assert tokens == ["hello", "世界", "bonjour", "мир"]
+
+    def test_numbers_preserved(self, word_tokeniser: WordTokeniser) -> None:
+        """Digit-only tokens are kept."""
+        tokens = word_tokeniser.tokenise("I have 42 apples")
+        assert tokens == ["i", "have", "42", "apples"]
+
+    def test_contractions(self, word_tokeniser: WordTokeniser) -> None:
+        """Contractions are split at the apostrophe into separate tokens."""
+        tokens = word_tokeniser.tokenise("I can't don't won't")
+        # Apostrophe is replaced with space, splitting the words
+        assert tokens == ["i", "can", "t", "don", "t", "won", "t"]
+
+    def test_hyphenated_words(self, word_tokeniser: WordTokeniser) -> None:
+        """A hyphenated compound is split into its component words."""
+        tokens = word_tokeniser.tokenise("state-of-the-art")
+        # Hyphens are removed as punctuation
+        assert tokens == ["state", "of", "the", "art"]
+
+    def test_custom_normalisation_form(self) -> None:
+        """``normalisation_form="NFKC"`` folds compatibility characters."""
+        tokeniser = WordTokeniser(normalisation_form="NFKC")
+        # NFKC normalises compatibility characters
+        # ™ (U+2122) is compatibility equivalent to 'TM'
+        tokens = tokeniser.tokenise("test™")
+        assert tokens == ["testtm"]
+
+
+class TestTokeniserProtocol:
+    """Structural checks that ``WordTokeniser`` can be used as a ``Tokeniser``."""
+
+    def test_word_tokeniser_implements_protocol(self) -> None:
+        """``tokenise`` exists, is callable and returns a list of strings."""
+        candidate: Tokeniser = WordTokeniser()
+
+        # The required method must be present and callable.
+        assert hasattr(candidate, "tokenise")
+        assert callable(candidate.tokenise)
+
+        # The return value must be a list whose items are all strings.
+        produced = candidate.tokenise("test")
+        assert isinstance(produced, list)
+        for item in produced:
+            assert isinstance(item, str)
@@ -0,0 +1,180 @@
+"""Tests for the core types module."""
+
+import pytest
+from pydantic import ValidationError as PydanticValidationError
+
+from veritext.core.types import CheckResult, ValidationContext, ValidationResult
+
+
+class TestValidationContext:
+    """Construction and immutability checks for ``ValidationContext``."""
+
+    def test_create_empty_context(self) -> None:
+        """With no arguments, ``reference`` is None and ``metadata`` is empty."""
+        ctx = ValidationContext()
+        assert ctx.reference is None
+        assert ctx.metadata == {}
+
+    def test_create_with_single_reference(self) -> None:
+        """A single string reference is stored unchanged."""
+        sentence = "The cat sat on the mat."
+        ctx = ValidationContext(reference=sentence)
+        assert ctx.reference == "The cat sat on the mat."
+
+    def test_create_with_multiple_references(self) -> None:
+        """A list of references is accepted and keeps both entries."""
+        refs = ["The cat sat on the mat.", "A cat was sitting on a mat."]
+        ctx = ValidationContext(reference=refs)
+        assert ctx.reference == refs
+        assert len(ctx.reference) == 2
+
+    def test_create_with_metadata(self) -> None:
+        """Metadata passed at construction is readable back from the context."""
+        extra = {"source": "test", "timestamp": "2024-01-01"}
+        ctx = ValidationContext(metadata=extra)
+        assert ctx.metadata == extra
+
+    def test_context_is_immutable(self) -> None:
+        """Assigning to a field after construction raises a pydantic error."""
+        ctx = ValidationContext(reference="test")
+        with pytest.raises(PydanticValidationError):
+            ctx.reference = "new value"  # type: ignore[misc]
+
+
+class TestCheckResult:
+    """Construction and immutability checks for ``CheckResult``."""
+
+    def test_create_passing_result(self) -> None:
+        """A passing check exposes every field it was constructed with."""
+        outcome = CheckResult(
+            name="bleu",
+            passed=True,
+            actual=0.85,
+            threshold=0.7,
+            message="BLEU score 0.85 >= threshold 0.7",
+        )
+        assert outcome.name == "bleu"
+        assert outcome.passed is True
+        assert outcome.actual == 0.85
+        assert outcome.threshold == 0.7
+        assert "0.85" in outcome.message
+
+    def test_create_failing_result(self) -> None:
+        """A failing check keeps its name, flag, actual value and threshold."""
+        outcome = CheckResult(
+            name="length",
+            passed=False,
+            actual=600,
+            threshold=500,
+            message="Length 600 exceeds maximum 500",
+        )
+        assert outcome.name == "length"
+        assert outcome.passed is False
+        assert outcome.actual == 600
+        assert outcome.threshold == 500
+
+    def test_create_result_without_threshold(self) -> None:
+        """``threshold`` may be passed explicitly as None."""
+        outcome = CheckResult(
+            name="contains",
+            passed=True,
+            actual=["hello", "world"],
+            threshold=None,
+            message="Found all required terms",
+        )
+        assert outcome.threshold is None
+
+    def test_result_is_immutable(self) -> None:
+        """Assigning to a field after construction raises a pydantic error."""
+        # ``threshold`` is deliberately omitted here, so it must be optional.
+        outcome = CheckResult(
+            name="test",
+            passed=True,
+            actual=1.0,
+            message="Test passed",
+        )
+        with pytest.raises(PydanticValidationError):
+            outcome.passed = False  # type: ignore[misc]
+
+
+class TestValidationResult:
+    """Checks for ``ValidationResult`` aggregation, summary and immutability."""
+
+    def test_all_checks_passed(self) -> None:
+        """With only passing checks there are no failures to report."""
+        passing = [
+            CheckResult(name="bleu", passed=True, actual=0.8, message="OK"),
+            CheckResult(name="length", passed=True, actual=100, message="OK"),
+        ]
+        outcome = ValidationResult(passed=True, checks=passing)
+
+        assert outcome.passed is True
+        assert len(outcome.checks) == 2
+        assert outcome.failed_checks == []
+        assert "All checks passed" in outcome.failure_summary
+
+    def test_some_checks_failed(self) -> None:
+        """``failed_checks`` holds only the failing checks, in input order."""
+        bleu_ok = CheckResult(name="bleu", passed=True, actual=0.8, message="OK")
+        too_long = CheckResult(
+            name="length",
+            passed=False,
+            actual=600,
+            threshold=500,
+            message="Length 600 exceeds maximum 500",
+        )
+        too_hard = CheckResult(
+            name="readability",
+            passed=False,
+            actual=12.5,
+            threshold=8.0,
+            message="Grade level 12.5 exceeds maximum 8.0",
+        )
+        outcome = ValidationResult(
+            passed=False, checks=[bleu_ok, too_long, too_hard]
+        )
+
+        assert outcome.passed is False
+        assert len(outcome.checks) == 3
+        # One comparison covers both the count and the order of the failures.
+        failed_names = [check.name for check in outcome.failed_checks]
+        assert failed_names == ["length", "readability"]
+
+    def test_failure_summary_format(self) -> None:
+        """The summary names the failure count, the check and its message."""
+        failing = CheckResult(
+            name="bleu",
+            passed=False,
+            actual=0.5,
+            threshold=0.7,
+            message="BLEU score 0.5 < threshold 0.7",
+        )
+        outcome = ValidationResult(passed=False, checks=[failing])
+
+        summary = outcome.failure_summary
+        expected_fragments = (
+            "Validation failed",
+            "1 check(s)",
+            "bleu",
+            "BLEU score 0.5 < threshold 0.7",
+        )
+        for fragment in expected_fragments:
+            assert fragment in summary
+
+    def test_empty_checks_list(self) -> None:
+        """An empty check list is valid and reports that all checks passed."""
+        outcome = ValidationResult(passed=True, checks=[])
+        assert outcome.passed is True
+        assert outcome.checks == []
+        assert outcome.failed_checks == []
+        assert "All checks passed" in outcome.failure_summary
+
+    def test_result_is_immutable(self) -> None:
+        """Assigning to a field after construction raises a pydantic error."""
+        outcome = ValidationResult(passed=True, checks=[])
+        with pytest.raises(PydanticValidationError):
+            outcome.passed = False  # type: ignore[misc]
+
+    def test_check_actual_can_be_any_type(self) -> None:
+        """``CheckResult.actual`` accepts lists, dicts and nested structures."""
+        # NOTE(review): this test only exercises CheckResult, not
+        # ValidationResult -- consider moving it to TestCheckResult.
+
+        # List
+        list_backed = CheckResult(
+            name="contains",
+            passed=True,
+            actual=["a", "b", "c"],
+            message="OK",
+        )
+        assert list_backed.actual == ["a", "b", "c"]
+
+        # Dict
+        dict_backed = CheckResult(
+            name="detailed",
+            passed=True,
+            actual={"bleu1": 0.9, "bleu2": 0.8},
+            message="OK",
+        )
+        assert dict_backed.actual == {"bleu1": 0.9, "bleu2": 0.8}
+
+        # Nested structure
+        nested = CheckResult(
+            name="complex",
+            passed=True,
+            actual={"scores": [0.1, 0.2], "meta": {"key": "value"}},
+            message="OK",
+        )
+        assert nested.actual["scores"] == [0.1, 0.2]