Add tests for tokeniser and types
Cover WordTokeniser (Unicode, empty input, punctuation, multiple scripts) and validation types (immutability, edge cases, failure summary).
This commit is contained in:
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
BIN
tests/__pycache__/__init__.cpython-314.pyc
Normal file
BIN
tests/__pycache__/__init__.cpython-314.pyc
Normal file
Binary file not shown.
BIN
tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc
Normal file
BIN
tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc
Normal file
Binary file not shown.
20
tests/conftest.py
Normal file
20
tests/conftest.py
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
"""Shared pytest fixtures for Veritext tests."""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from veritext.core.tokenisation import WordTokeniser
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def word_tokeniser() -> WordTokeniser:
    """Provide a ``WordTokeniser`` constructed with no arguments."""
    default_tokeniser = WordTokeniser()
    return default_tokeniser
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def word_tokeniser_no_lowercase() -> WordTokeniser:
    """Provide a ``WordTokeniser`` constructed with ``lowercase=False``."""
    case_preserving_tokeniser = WordTokeniser(lowercase=False)
    return case_preserving_tokeniser
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def word_tokeniser_keep_punctuation() -> WordTokeniser:
    """Provide a ``WordTokeniser`` constructed with ``remove_punctuation=False``."""
    punctuation_keeping_tokeniser = WordTokeniser(remove_punctuation=False)
    return punctuation_keeping_tokeniser
|
||||||
0
tests/test_core/__init__.py
Normal file
0
tests/test_core/__init__.py
Normal file
BIN
tests/test_core/__pycache__/__init__.cpython-314.pyc
Normal file
BIN
tests/test_core/__pycache__/__init__.cpython-314.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
103
tests/test_core/test_tokenisation.py
Normal file
103
tests/test_core/test_tokenisation.py
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
"""Tests for the tokenisation module."""
|
||||||
|
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
from veritext.core.tokenisation import WordTokeniser
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from veritext.core.tokenisation import Tokeniser
|
||||||
|
|
||||||
|
|
||||||
|
class TestWordTokeniser:
    """Behavioural tests for ``WordTokeniser.tokenise``.

    Tests receive their tokeniser from the shared pytest fixtures
    (``word_tokeniser``, ``word_tokeniser_no_lowercase``,
    ``word_tokeniser_keep_punctuation``) unless they need a constructor
    argument that no fixture provides.
    """

    def test_basic_tokenisation(self, word_tokeniser: WordTokeniser) -> None:
        """A plain sentence is split on spaces and lowercased."""
        tokens = word_tokeniser.tokenise("The cat sat on the mat")
        assert tokens == ["the", "cat", "sat", "on", "the", "mat"]

    def test_lowercasing(self, word_tokeniser: WordTokeniser) -> None:
        """The default tokeniser lowercases mixed-case input."""
        tokens = word_tokeniser.tokenise("Hello WORLD")
        assert tokens == ["hello", "world"]

    def test_no_lowercasing(self, word_tokeniser_no_lowercase: WordTokeniser) -> None:
        """With ``lowercase=False`` the original casing is kept."""
        tokens = word_tokeniser_no_lowercase.tokenise("Hello WORLD")
        assert tokens == ["Hello", "WORLD"]

    def test_punctuation_removal(self, word_tokeniser: WordTokeniser) -> None:
        """The default tokeniser drops commas, exclamation and question marks."""
        tokens = word_tokeniser.tokenise("Hello, world! How are you?")
        assert tokens == ["hello", "world", "how", "are", "you"]

    def test_keep_punctuation(
        self, word_tokeniser_keep_punctuation: WordTokeniser
    ) -> None:
        """With ``remove_punctuation=False`` punctuation stays on the token."""
        tokens = word_tokeniser_keep_punctuation.tokenise("Hello, world!")
        assert tokens == ["hello,", "world!"]

    def test_empty_string(self, word_tokeniser: WordTokeniser) -> None:
        """An empty string yields no tokens."""
        tokens = word_tokeniser.tokenise("")
        assert tokens == []

    def test_whitespace_only(self, word_tokeniser: WordTokeniser) -> None:
        """Input made only of spaces, tabs and newlines yields no tokens."""
        tokens = word_tokeniser.tokenise(" \t\n ")
        assert tokens == []

    def test_multiple_spaces(self, word_tokeniser: WordTokeniser) -> None:
        """A run of several spaces between words produces no empty tokens."""
        # Build the run explicitly so the test keeps exercising repeated
        # spaces even if tooling collapses whitespace inside string literals.
        text = "hello" + " " * 4 + "world"
        tokens = word_tokeniser.tokenise(text)
        assert tokens == ["hello", "world"]

    def test_unicode_nfc_normalisation(self, word_tokeniser: WordTokeniser) -> None:
        """Composed and decomposed spellings of 'café' tokenise identically."""
        # 'é' can be composed (U+00E9) or decomposed (e + U+0301)
        composed = "caf\u00e9"  # café with composed é
        decomposed = "cafe\u0301"  # café with decomposed é

        # Guard the premise: the two inputs must differ before normalisation,
        # otherwise the equality below would prove nothing.
        assert composed != decomposed

        tokens_composed = word_tokeniser.tokenise(composed)
        tokens_decomposed = word_tokeniser.tokenise(decomposed)

        # After NFC normalisation, both should be the same
        assert tokens_composed == tokens_decomposed
        # Spell the expected token with an escape so its composed form is
        # explicit and cannot be silently changed by an editor.
        assert tokens_composed == ["caf\u00e9"]

    def test_unicode_emoji(self, word_tokeniser: WordTokeniser) -> None:
        """Emoji do not survive as tokens under the default settings."""
        # Emoji are removed as punctuation by default
        tokens = word_tokeniser.tokenise("Hello 👋 world 🌍")
        assert tokens == ["hello", "world"]

    def test_unicode_non_latin(self, word_tokeniser: WordTokeniser) -> None:
        """Japanese text separated by a space is split into two tokens."""
        tokens = word_tokeniser.tokenise("日本語 テスト")
        assert tokens == ["日本語", "テスト"]

    def test_unicode_mixed_scripts(self, word_tokeniser: WordTokeniser) -> None:
        """Latin, CJK and Cyrillic words can be mixed in one input."""
        tokens = word_tokeniser.tokenise("Hello 世界 Bonjour мир")
        assert tokens == ["hello", "世界", "bonjour", "мир"]

    def test_numbers_preserved(self, word_tokeniser: WordTokeniser) -> None:
        """Digit sequences are kept as tokens."""
        tokens = word_tokeniser.tokenise("I have 42 apples")
        assert tokens == ["i", "have", "42", "apples"]

    def test_contractions(self, word_tokeniser: WordTokeniser) -> None:
        """Contractions are split at the apostrophe."""
        tokens = word_tokeniser.tokenise("I can't don't won't")
        # Apostrophe is replaced with space, splitting the words
        assert tokens == ["i", "can", "t", "don", "t", "won", "t"]

    def test_hyphenated_words(self, word_tokeniser: WordTokeniser) -> None:
        """Hyphenated compounds are split into their parts."""
        tokens = word_tokeniser.tokenise("state-of-the-art")
        # Hyphens are removed as punctuation
        assert tokens == ["state", "of", "the", "art"]

    def test_custom_normalisation_form(self) -> None:
        """``normalisation_form="NFKC"`` folds compatibility characters."""
        # No shared fixture uses NFKC, so build the tokeniser locally.
        tokeniser = WordTokeniser(normalisation_form="NFKC")
        # NFKC normalises compatibility characters
        # ™ (U+2122) is compatibility equivalent to 'TM'
        tokens = tokeniser.tokenise("test™")
        assert tokens == ["testtm"]
|
||||||
|
|
||||||
|
|
||||||
|
class TestTokeniserProtocol:
    """Structural checks that ``WordTokeniser`` can stand in for ``Tokeniser``."""

    def test_word_tokeniser_implements_protocol(self) -> None:
        """``WordTokeniser`` has a callable ``tokenise`` returning a list of str."""
        candidate: Tokeniser = WordTokeniser()

        # The required method must be present and callable.
        assert hasattr(candidate, "tokenise")
        assert callable(candidate.tokenise)

        # The result must be a list whose elements are all strings.
        produced = candidate.tokenise("test")
        assert isinstance(produced, list)
        assert all(isinstance(token, str) for token in produced)
|
||||||
180
tests/test_core/test_types.py
Normal file
180
tests/test_core/test_types.py
Normal file
@@ -0,0 +1,180 @@
|
|||||||
|
"""Tests for the core types module."""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from pydantic import ValidationError as PydanticValidationError
|
||||||
|
|
||||||
|
from veritext.core.types import CheckResult, ValidationContext, ValidationResult
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidationContext:
    """Construction and immutability tests for ``ValidationContext``."""

    def test_create_empty_context(self) -> None:
        """With no arguments, ``reference`` is None and ``metadata`` is empty."""
        empty_context = ValidationContext()
        assert empty_context.reference is None
        assert empty_context.metadata == {}

    def test_create_with_single_reference(self) -> None:
        """A single string reference is stored unchanged."""
        single = ValidationContext(reference="The cat sat on the mat.")
        assert single.reference == "The cat sat on the mat."

    def test_create_with_multiple_references(self) -> None:
        """A list of reference strings is stored with both entries."""
        expected_references = [
            "The cat sat on the mat.",
            "A cat was sitting on a mat.",
        ]
        multi = ValidationContext(reference=expected_references)
        assert multi.reference == expected_references
        assert len(multi.reference) == 2

    def test_create_with_metadata(self) -> None:
        """A metadata mapping passed at construction is readable afterwards."""
        supplied = {"source": "test", "timestamp": "2024-01-01"}
        with_metadata = ValidationContext(metadata=supplied)
        assert with_metadata.metadata == supplied

    def test_context_is_immutable(self) -> None:
        """Assigning to a field after construction raises a pydantic error."""
        frozen_context = ValidationContext(reference="test")
        with pytest.raises(PydanticValidationError):
            frozen_context.reference = "new value"  # type: ignore[misc]
|
||||||
|
|
||||||
|
|
||||||
|
class TestCheckResult:
    """Construction and immutability tests for ``CheckResult``."""

    def test_create_passing_result(self) -> None:
        """All fields of a passing check are readable after construction."""
        passing = CheckResult(
            name="bleu",
            passed=True,
            actual=0.85,
            threshold=0.7,
            message="BLEU score 0.85 >= threshold 0.7",
        )

        assert passing.name == "bleu"
        assert passing.passed is True
        assert passing.actual == 0.85
        assert passing.threshold == 0.7
        assert "0.85" in passing.message

    def test_create_failing_result(self) -> None:
        """A failing check keeps its integer ``actual`` and ``threshold``."""
        failing = CheckResult(
            name="length",
            passed=False,
            actual=600,
            threshold=500,
            message="Length 600 exceeds maximum 500",
        )

        assert failing.name == "length"
        assert failing.passed is False
        assert failing.actual == 600
        assert failing.threshold == 500

    def test_create_result_without_threshold(self) -> None:
        """``threshold`` may be passed explicitly as None."""
        no_threshold = CheckResult(
            name="contains",
            passed=True,
            actual=["hello", "world"],
            threshold=None,
            message="Found all required terms",
        )

        assert no_threshold.threshold is None

    def test_result_is_immutable(self) -> None:
        """Assigning to a field after construction raises a pydantic error."""
        frozen_check = CheckResult(
            name="test",
            passed=True,
            actual=1.0,
            message="Test passed",
        )

        with pytest.raises(PydanticValidationError):
            frozen_check.passed = False  # type: ignore[misc]
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidationResult:
    """Tests for ``ValidationResult``: checks, failed checks and summary."""

    def test_all_checks_passed(self) -> None:
        """With only passing checks nothing is failed and the summary says so."""
        bleu_ok = CheckResult(name="bleu", passed=True, actual=0.8, message="OK")
        length_ok = CheckResult(name="length", passed=True, actual=100, message="OK")
        outcome = ValidationResult(passed=True, checks=[bleu_ok, length_ok])

        assert outcome.passed is True
        assert len(outcome.checks) == 2
        assert outcome.failed_checks == []
        assert "All checks passed" in outcome.failure_summary

    def test_some_checks_failed(self) -> None:
        """``failed_checks`` holds only the failing checks, in input order."""
        bleu_ok = CheckResult(name="bleu", passed=True, actual=0.8, message="OK")
        too_long = CheckResult(
            name="length",
            passed=False,
            actual=600,
            threshold=500,
            message="Length 600 exceeds maximum 500",
        )
        too_hard = CheckResult(
            name="readability",
            passed=False,
            actual=12.5,
            threshold=8.0,
            message="Grade level 12.5 exceeds maximum 8.0",
        )
        outcome = ValidationResult(
            passed=False, checks=[bleu_ok, too_long, too_hard]
        )

        assert outcome.passed is False
        assert len(outcome.checks) == 3
        # Comparing the name list checks both the count and the ordering.
        failed_names = [check.name for check in outcome.failed_checks]
        assert failed_names == ["length", "readability"]

    def test_failure_summary_format(self) -> None:
        """The summary names the failed check and includes its message."""
        low_bleu = CheckResult(
            name="bleu",
            passed=False,
            actual=0.5,
            threshold=0.7,
            message="BLEU score 0.5 < threshold 0.7",
        )
        outcome = ValidationResult(passed=False, checks=[low_bleu])

        text = outcome.failure_summary
        assert "Validation failed" in text
        assert "1 check(s)" in text
        assert "bleu" in text
        assert "BLEU score 0.5 < threshold 0.7" in text

    def test_empty_checks_list(self) -> None:
        """A result with no checks reports that everything passed."""
        outcome = ValidationResult(passed=True, checks=[])

        assert outcome.passed is True
        assert outcome.checks == []
        assert outcome.failed_checks == []
        assert "All checks passed" in outcome.failure_summary

    def test_result_is_immutable(self) -> None:
        """Assigning to a field after construction raises a pydantic error."""
        frozen_result = ValidationResult(passed=True, checks=[])
        with pytest.raises(PydanticValidationError):
            frozen_result.passed = False  # type: ignore[misc]

    def test_check_actual_can_be_any_type(self) -> None:
        """``CheckResult.actual`` accepts a list, a dict and a nested structure."""
        # NOTE(review): this test only exercises CheckResult, not
        # ValidationResult -- consider moving it next to the CheckResult tests.

        # List
        list_valued = CheckResult(
            name="contains",
            passed=True,
            actual=["a", "b", "c"],
            message="OK",
        )
        assert list_valued.actual == ["a", "b", "c"]

        # Dict
        dict_valued = CheckResult(
            name="detailed",
            passed=True,
            actual={"bleu1": 0.9, "bleu2": 0.8},
            message="OK",
        )
        assert dict_valued.actual == {"bleu1": 0.9, "bleu2": 0.8}

        # Nested structure
        nested_valued = CheckResult(
            name="complex",
            passed=True,
            actual={"scores": [0.1, 0.2], "meta": {"key": "value"}},
            message="OK",
        )
        assert nested_valued.actual["scores"] == [0.1, 0.2]
|
||||||
Reference in New Issue
Block a user