test(core): add tokenisation and types tests

Cover WordTokeniser (Unicode, empty input, punctuation, multiple scripts)
and validation types (immutability, edge cases, failure summary).
This commit is contained in:
2026-02-03 16:16:20 +00:00
parent a65249fa44
commit 1e3618e637
10 changed files with 348 additions and 0 deletions

View File

@@ -0,0 +1,124 @@
"""Tests for the tokenisation module."""
from typing import TYPE_CHECKING
from veritext.core.tokenisation import WordTokeniser
if TYPE_CHECKING:
from veritext.core.tokenisation import Tokeniser
class TestWordTokeniser:
    """Behavioural tests for WordTokeniser.

    The ``word_tokeniser*`` parameters are pytest fixtures defined outside this
    module (presumably in a conftest.py -- confirm). Judging by the
    expectations below:

    * ``word_tokeniser`` lowercases and drops punctuation,
    * ``word_tokeniser_no_lowercase`` preserves the original casing,
    * ``word_tokeniser_keep_punctuation`` leaves punctuation attached to words.
    """

    def test_basic_tokenisation(self, word_tokeniser: WordTokeniser) -> None:
        """A plain sentence is split into lowercased words, in order."""
        tokens = word_tokeniser.tokenise("The cat sat on the mat")
        assert tokens == ["the", "cat", "sat", "on", "the", "mat"]

    def test_lowercasing(self, word_tokeniser: WordTokeniser) -> None:
        """Tokens are lowercased by the default tokeniser."""
        tokens = word_tokeniser.tokenise("Hello WORLD")
        assert tokens == ["hello", "world"]

    def test_no_lowercasing(self, word_tokeniser_no_lowercase: WordTokeniser) -> None:
        """Casing is preserved when lowercasing is switched off."""
        tokens = word_tokeniser_no_lowercase.tokenise("Hello WORLD")
        assert tokens == ["Hello", "WORLD"]

    def test_punctuation_removal(self, word_tokeniser: WordTokeniser) -> None:
        """Punctuation is removed by the default tokeniser."""
        tokens = word_tokeniser.tokenise("Hello, world! How are you?")
        assert tokens == ["hello", "world", "how", "are", "you"]

    def test_keep_punctuation(
        self, word_tokeniser_keep_punctuation: WordTokeniser
    ) -> None:
        """Punctuation stays attached to its word when removal is disabled."""
        tokens = word_tokeniser_keep_punctuation.tokenise("Hello, world!")
        assert tokens == ["hello,", "world!"]

    def test_empty_string(self, word_tokeniser: WordTokeniser) -> None:
        """An empty string yields an empty token list."""
        tokens = word_tokeniser.tokenise("")
        assert tokens == []

    def test_whitespace_only(self, word_tokeniser: WordTokeniser) -> None:
        """A whitespace-only string yields an empty token list."""
        tokens = word_tokeniser.tokenise(" \t\n ")
        assert tokens == []

    def test_multiple_spaces(self, word_tokeniser: WordTokeniser) -> None:
        """A run of several spaces between words produces no empty tokens."""
        # Build the run explicitly: a literal with consecutive spaces is easy
        # to collapse by accident (editors, formatters, copy/paste), which
        # would leave this test exercising a single space only.
        text = "hello" + " " * 4 + "world"
        assert "  " in text  # the premise of the test: consecutive spaces

        tokens = word_tokeniser.tokenise(text)
        assert tokens == ["hello", "world"]

    def test_unicode_nfc_normalisation(self) -> None:
        """Composed and decomposed spellings tokenise to the same NFC token."""
        # 'é' can be composed (U+00E9) or decomposed (e + U+0301)
        composed = "caf\u00e9"  # café with composed é
        decomposed = "cafe\u0301"  # café with decomposed é
        tokeniser = WordTokeniser()

        tokens_composed = tokeniser.tokenise(composed)
        tokens_decomposed = tokeniser.tokenise(decomposed)

        # After NFC normalisation, both should be the same
        assert tokens_composed == tokens_decomposed
        # Compare against the escaped, composed spelling rather than a raw
        # "café" literal, whose normalisation form would depend on how this
        # file happened to be saved.
        assert tokens_composed == [composed]

    def test_unicode_emoji(self, word_tokeniser: WordTokeniser) -> None:
        """Emoji do not survive default tokenisation."""
        # The default tokeniser drops emoji together with punctuation
        tokens = word_tokeniser.tokenise("Hello 👋 world 🌍")
        assert tokens == ["hello", "world"]

    def test_unicode_non_latin(self, word_tokeniser: WordTokeniser) -> None:
        """Non-Latin scripts are kept as whitespace-delimited tokens."""
        tokens = word_tokeniser.tokenise("日本語 テスト")
        assert tokens == ["日本語", "テスト"]

    def test_unicode_mixed_scripts(self, word_tokeniser: WordTokeniser) -> None:
        """Text mixing several scripts is tokenised word by word."""
        tokens = word_tokeniser.tokenise("Hello 世界 Bonjour мир")
        assert tokens == ["hello", "世界", "bonjour", "мир"]

    def test_numbers_preserved(self, word_tokeniser: WordTokeniser) -> None:
        """Digit-only words are kept as tokens."""
        tokens = word_tokeniser.tokenise("I have 42 apples")
        assert tokens == ["i", "have", "42", "apples"]

    def test_contractions(self, word_tokeniser: WordTokeniser) -> None:
        """Contractions are split at the apostrophe."""
        tokens = word_tokeniser.tokenise("I can't don't won't")
        # The apostrophe acts as a separator, so each contraction yields
        # two tokens ("can" + "t", ...)
        assert tokens == ["i", "can", "t", "don", "t", "won", "t"]

    def test_hyphenated_words(self, word_tokeniser: WordTokeniser) -> None:
        """Hyphenated compounds are split into their component words."""
        tokens = word_tokeniser.tokenise("state-of-the-art")
        # Hyphens are removed as punctuation
        assert tokens == ["state", "of", "the", "art"]

    def test_custom_normalisation_form(self) -> None:
        """A non-default Unicode normalisation form is honoured."""
        tokeniser = WordTokeniser(normalisation_form="NFKC")
        # NFKC folds compatibility characters: ™ (U+2122) becomes "TM",
        # which the default lowercasing then turns into "tm"
        tokens = tokeniser.tokenise("test™")
        assert tokens == ["testtm"]
class TestTokeniserProtocol:
    """Checks that concrete tokenisers conform to the Tokeniser protocol."""

    def test_word_tokeniser_implements_protocol(self) -> None:
        """WordTokeniser exposes a callable ``tokenise`` returning a list of str."""
        # The annotation is for the type checker only: Tokeniser is imported
        # under TYPE_CHECKING, and local annotations are never evaluated.
        candidate: Tokeniser = WordTokeniser()

        # The required method must exist and be callable
        assert hasattr(candidate, "tokenise")
        assert callable(candidate.tokenise)

        # The return value must be a list whose items are all strings
        produced = candidate.tokenise("test")
        assert isinstance(produced, list)
        for item in produced:
            assert isinstance(item, str)