diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,23 @@
+"""Shared pytest fixtures for Veritext tests."""
+
+import pytest
+
+from veritext.core.tokenisation import WordTokeniser
+
+
+@pytest.fixture
+def word_tokeniser() -> WordTokeniser:
+    """Provide a ``WordTokeniser`` built with no arguments."""
+    return WordTokeniser()
+
+
+@pytest.fixture
+def word_tokeniser_no_lowercase() -> WordTokeniser:
+    """Provide a ``WordTokeniser`` built with ``lowercase=False``."""
+    return WordTokeniser(lowercase=False)
+
+
+@pytest.fixture
+def word_tokeniser_keep_punctuation() -> WordTokeniser:
+    """Provide a ``WordTokeniser`` built with ``remove_punctuation=False``."""
+    return WordTokeniser(remove_punctuation=False)
diff --git a/tests/test_core/__init__.py b/tests/test_core/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_core/test_tokenisation.py b/tests/test_core/test_tokenisation.py
new file mode 100644
--- /dev/null
+++ b/tests/test_core/test_tokenisation.py
@@ -0,0 +1,124 @@
+"""Tests for the tokenisation module."""
+
+from typing import TYPE_CHECKING
+
+from veritext.core.tokenisation import WordTokeniser
+
+if TYPE_CHECKING:
+    from veritext.core.tokenisation import Tokeniser
+
+
+class TestWordTokeniser:
+    """Expected token lists from ``WordTokeniser.tokenise`` for varied inputs."""
+
+    def test_basic_tokenisation(self, word_tokeniser: WordTokeniser) -> None:
+        """A plain sentence is split into lowercased words."""
+        tokens = word_tokeniser.tokenise("The cat sat on the mat")
+        assert tokens == ["the", "cat", "sat", "on", "the", "mat"]
+
+    def test_lowercasing(self, word_tokeniser: WordTokeniser) -> None:
+        """The default tokeniser lowercases every token."""
+        tokens = word_tokeniser.tokenise("Hello WORLD")
+        assert tokens == ["hello", "world"]
+
+    def test_no_lowercasing(self, word_tokeniser_no_lowercase: WordTokeniser) -> None:
+        """With ``lowercase=False`` the original casing is kept."""
+        tokens = word_tokeniser_no_lowercase.tokenise("Hello WORLD")
+        assert tokens == ["Hello", "WORLD"]
+
+    def test_punctuation_removal(self, word_tokeniser: WordTokeniser) -> None:
+        """The default tokeniser drops ',', '!' and '?' from the tokens."""
+        tokens = word_tokeniser.tokenise("Hello, world! How are you?")
+        assert tokens == ["hello", "world", "how", "are", "you"]
+
+    def test_keep_punctuation(
+        self, word_tokeniser_keep_punctuation: WordTokeniser
+    ) -> None:
+        """With ``remove_punctuation=False`` punctuation stays on the tokens."""
+        tokens = word_tokeniser_keep_punctuation.tokenise("Hello, world!")
+        assert tokens == ["hello,", "world!"]
+
+    def test_empty_string(self, word_tokeniser: WordTokeniser) -> None:
+        """An empty string yields no tokens."""
+        tokens = word_tokeniser.tokenise("")
+        assert tokens == []
+
+    def test_whitespace_only(self, word_tokeniser: WordTokeniser) -> None:
+        """Input consisting only of whitespace yields no tokens."""
+        tokens = word_tokeniser.tokenise(" \t\n ")
+        assert tokens == []
+
+    def test_multiple_spaces(self, word_tokeniser: WordTokeniser) -> None:
+        """A run of consecutive spaces between words produces no empty tokens."""
+        tokens = word_tokeniser.tokenise("hello    world")
+        assert tokens == ["hello", "world"]
+
+    def test_unicode_nfc_normalisation(self) -> None:
+        """Composed and decomposed 'é' tokenise to the same token."""
+        # 'é' can be composed (U+00E9) or decomposed (e + U+0301)
+        composed = "caf\u00e9"  # café with composed é
+        decomposed = "cafe\u0301"  # café with decomposed é
+
+        tokeniser = WordTokeniser()
+        tokens_composed = tokeniser.tokenise(composed)
+        tokens_decomposed = tokeniser.tokenise(decomposed)
+
+        # After NFC normalisation, both should be the same
+        assert tokens_composed == tokens_decomposed
+        assert tokens_composed == ["café"]
+
+    def test_unicode_emoji(self, word_tokeniser: WordTokeniser) -> None:
+        """Emoji are absent from the default tokeniser's output."""
+        # Emoji are removed as punctuation by default
+        tokens = word_tokeniser.tokenise("Hello 👋 world 🌍")
+        assert tokens == ["hello", "world"]
+
+    def test_unicode_non_latin(self, word_tokeniser: WordTokeniser) -> None:
+        """Space-separated Japanese text is kept as two tokens."""
+        tokens = word_tokeniser.tokenise("日本語 テスト")
+        assert tokens == ["日本語", "テスト"]
+
+    def test_unicode_mixed_scripts(self, word_tokeniser: WordTokeniser) -> None:
+        """Latin, CJK and Cyrillic words in one string are all kept."""
+        tokens = word_tokeniser.tokenise("Hello 世界 Bonjour мир")
+        assert tokens == ["hello", "世界", "bonjour", "мир"]
+
+    def test_numbers_preserved(self, word_tokeniser: WordTokeniser) -> None:
+        """Digit-only words survive tokenisation."""
+        tokens = word_tokeniser.tokenise("I have 42 apples")
+        assert tokens == ["i", "have", "42", "apples"]
+
+    def test_contractions(self, word_tokeniser: WordTokeniser) -> None:
+        """Apostrophes split contractions into separate tokens."""
+        tokens = word_tokeniser.tokenise("I can't don't won't")
+        # Apostrophe is replaced with space, splitting the words
+        assert tokens == ["i", "can", "t", "don", "t", "won", "t"]
+
+    def test_hyphenated_words(self, word_tokeniser: WordTokeniser) -> None:
+        """Hyphens split a hyphenated compound into its parts."""
+        tokens = word_tokeniser.tokenise("state-of-the-art")
+        # Hyphens are removed as punctuation
+        assert tokens == ["state", "of", "the", "art"]
+
+    def test_custom_normalisation_form(self) -> None:
+        """``normalisation_form="NFKC"`` folds '™' into the letters 'tm'."""
+        tokeniser = WordTokeniser(normalisation_form="NFKC")
+        # NFKC normalises compatibility characters
+        # ™ (U+2122) is compatibility equivalent to 'TM'
+        tokens = tokeniser.tokenise("test™")
+        assert tokens == ["testtm"]
+
+
+class TestTokeniserProtocol:
+    """Checks that ``WordTokeniser`` offers the ``Tokeniser`` interface."""
+
+    def test_word_tokeniser_implements_protocol(self) -> None:
+        """``tokenise`` exists, is callable, and returns a list of strings."""
+        tokeniser: Tokeniser = WordTokeniser()
+        # Check that it has the required method
+        assert hasattr(tokeniser, "tokenise")
+        assert callable(tokeniser.tokenise)
+        # Check return type
+        result = tokeniser.tokenise("test")
+        assert isinstance(result, list)
+        assert all(isinstance(t, str) for t in result)
diff --git a/tests/test_core/test_types.py b/tests/test_core/test_types.py
new file mode 100644
--- /dev/null
+++ b/tests/test_core/test_types.py
@@ -0,0 +1,201 @@
+"""Tests for the core types module."""
+
+import pytest
+from pydantic import ValidationError as PydanticValidationError
+
+from veritext.core.types import CheckResult, ValidationContext, ValidationResult
+
+
+class TestValidationContext:
+    """Construction and immutability of ``ValidationContext``."""
+
+    def test_create_empty_context(self) -> None:
+        """With no arguments, ``reference`` is None and ``metadata`` is empty."""
+        context = ValidationContext()
+        assert context.reference is None
+        assert context.metadata == {}
+
+    def test_create_with_single_reference(self) -> None:
+        """A single reference string is stored unchanged."""
+        context = ValidationContext(reference="The cat sat on the mat.")
+        assert context.reference == "The cat sat on the mat."
+
+    def test_create_with_multiple_references(self) -> None:
+        """A list of reference strings is stored unchanged."""
+        references = ["The cat sat on the mat.", "A cat was sitting on a mat."]
+        context = ValidationContext(reference=references)
+        assert context.reference == references
+        assert len(context.reference) == 2
+
+    def test_create_with_metadata(self) -> None:
+        """A metadata dict is stored unchanged."""
+        metadata = {"source": "test", "timestamp": "2024-01-01"}
+        context = ValidationContext(metadata=metadata)
+        assert context.metadata == metadata
+
+    def test_context_is_immutable(self) -> None:
+        """Assigning to a field raises pydantic's ``ValidationError``."""
+        context = ValidationContext(reference="test")
+        with pytest.raises(PydanticValidationError):
+            context.reference = "new value"  # type: ignore[misc]
+
+
+class TestCheckResult:
+    """Construction and immutability of ``CheckResult``."""
+
+    def test_create_passing_result(self) -> None:
+        """All fields of a passing result are readable after construction."""
+        result = CheckResult(
+            name="bleu",
+            passed=True,
+            actual=0.85,
+            threshold=0.7,
+            message="BLEU score 0.85 >= threshold 0.7",
+        )
+        assert result.name == "bleu"
+        assert result.passed is True
+        assert result.actual == 0.85
+        assert result.threshold == 0.7
+        assert "0.85" in result.message
+
+    def test_create_failing_result(self) -> None:
+        """A failing result keeps its name, flag, actual value and threshold."""
+        result = CheckResult(
+            name="length",
+            passed=False,
+            actual=600,
+            threshold=500,
+            message="Length 600 exceeds maximum 500",
+        )
+        assert result.name == "length"
+        assert result.passed is False
+        assert result.actual == 600
+        assert result.threshold == 500
+
+    def test_create_result_without_threshold(self) -> None:
+        """``threshold`` may be passed explicitly as None."""
+        result = CheckResult(
+            name="contains",
+            passed=True,
+            actual=["hello", "world"],
+            threshold=None,
+            message="Found all required terms",
+        )
+        assert result.threshold is None
+
+    def test_result_is_immutable(self) -> None:
+        """Assigning to a field raises pydantic's ``ValidationError``."""
+        result = CheckResult(
+            name="test",
+            passed=True,
+            actual=1.0,
+            message="Test passed",
+        )
+        with pytest.raises(PydanticValidationError):
+            result.passed = False  # type: ignore[misc]
+
+
+class TestValidationResult:
+    """Behaviour of ``ValidationResult`` over its list of checks."""
+
+    def test_all_checks_passed(self) -> None:
+        """With only passing checks, ``failed_checks`` is empty."""
+        checks = [
+            CheckResult(name="bleu", passed=True, actual=0.8, message="OK"),
+            CheckResult(name="length", passed=True, actual=100, message="OK"),
+        ]
+        result = ValidationResult(passed=True, checks=checks)
+
+        assert result.passed is True
+        assert len(result.checks) == 2
+        assert result.failed_checks == []
+        assert "All checks passed" in result.failure_summary
+
+    def test_some_checks_failed(self) -> None:
+        """``failed_checks`` lists only the failing checks, in input order."""
+        checks = [
+            CheckResult(name="bleu", passed=True, actual=0.8, message="OK"),
+            CheckResult(
+                name="length",
+                passed=False,
+                actual=600,
+                threshold=500,
+                message="Length 600 exceeds maximum 500",
+            ),
+            CheckResult(
+                name="readability",
+                passed=False,
+                actual=12.5,
+                threshold=8.0,
+                message="Grade level 12.5 exceeds maximum 8.0",
+            ),
+        ]
+        result = ValidationResult(passed=False, checks=checks)
+
+        assert result.passed is False
+        assert len(result.checks) == 3
+        assert len(result.failed_checks) == 2
+        assert result.failed_checks[0].name == "length"
+        assert result.failed_checks[1].name == "readability"
+
+    def test_failure_summary_format(self) -> None:
+        """The summary names the failure count, the check and its message."""
+        checks = [
+            CheckResult(
+                name="bleu",
+                passed=False,
+                actual=0.5,
+                threshold=0.7,
+                message="BLEU score 0.5 < threshold 0.7",
+            ),
+        ]
+        result = ValidationResult(passed=False, checks=checks)
+
+        summary = result.failure_summary
+        assert "Validation failed" in summary
+        assert "1 check(s)" in summary
+        assert "bleu" in summary
+        assert "BLEU score 0.5 < threshold 0.7" in summary
+
+    def test_empty_checks_list(self) -> None:
+        """With no checks, ``failed_checks`` is empty and the summary is positive."""
+        result = ValidationResult(passed=True, checks=[])
+        assert result.passed is True
+        assert result.checks == []
+        assert result.failed_checks == []
+        assert "All checks passed" in result.failure_summary
+
+    def test_result_is_immutable(self) -> None:
+        """Assigning to a field raises pydantic's ``ValidationError``."""
+        result = ValidationResult(passed=True, checks=[])
+        with pytest.raises(PydanticValidationError):
+            result.passed = False  # type: ignore[misc]
+
+    def test_check_actual_can_be_any_type(self) -> None:
+        """``CheckResult.actual`` accepts a list, a dict and nested structures."""
+        # List
+        result1 = CheckResult(
+            name="contains",
+            passed=True,
+            actual=["a", "b", "c"],
+            message="OK",
+        )
+        assert result1.actual == ["a", "b", "c"]
+
+        # Dict
+        result2 = CheckResult(
+            name="detailed",
+            passed=True,
+            actual={"bleu1": 0.9, "bleu2": 0.8},
+            message="OK",
+        )
+        assert result2.actual == {"bleu1": 0.9, "bleu2": 0.8}
+
+        # Nested structure
+        result3 = CheckResult(
+            name="complex",
+            passed=True,
+            actual={"scores": [0.1, 0.2], "meta": {"key": "value"}},
+            message="OK",
+        )
+        assert result3.actual["scores"] == [0.1, 0.2]