Cover WordTokeniser (Unicode, empty input, punctuation, multiple scripts) and validation types (immutability, edge cases, failure summary).
104 lines
4.3 KiB
Python
104 lines
4.3 KiB
Python
"""Tests for the tokenisation module."""
|
|
|
|
from typing import TYPE_CHECKING
|
|
|
|
from veritext.core.tokenisation import WordTokeniser
|
|
|
|
if TYPE_CHECKING:
|
|
from veritext.core.tokenisation import Tokeniser
|
|
|
|
|
|
class TestWordTokeniser:
    """Behavioural tests for ``WordTokeniser.tokenise``.

    The ``word_tokeniser``, ``word_tokeniser_no_lowercase`` and
    ``word_tokeniser_keep_punctuation`` arguments are pytest fixtures that are
    not defined in this module (presumably they live in ``conftest.py`` --
    confirm there for the exact constructor options each one uses).
    """

    def test_basic_tokenisation(self, word_tokeniser: WordTokeniser) -> None:
        """A plain sentence is split on spaces and lowercased."""
        tokens = word_tokeniser.tokenise("The cat sat on the mat")
        assert tokens == ["the", "cat", "sat", "on", "the", "mat"]

    def test_lowercasing(self, word_tokeniser: WordTokeniser) -> None:
        """The default tokeniser folds mixed-case input to lowercase."""
        tokens = word_tokeniser.tokenise("Hello WORLD")
        assert tokens == ["hello", "world"]

    def test_no_lowercasing(
        self, word_tokeniser_no_lowercase: WordTokeniser
    ) -> None:
        """With lowercasing disabled the original casing is kept."""
        tokens = word_tokeniser_no_lowercase.tokenise("Hello WORLD")
        assert tokens == ["Hello", "WORLD"]

    def test_punctuation_removal(self, word_tokeniser: WordTokeniser) -> None:
        """Commas, exclamation marks and question marks do not reach tokens."""
        tokens = word_tokeniser.tokenise("Hello, world! How are you?")
        assert tokens == ["hello", "world", "how", "are", "you"]

    def test_keep_punctuation(
        self, word_tokeniser_keep_punctuation: WordTokeniser
    ) -> None:
        """With punctuation kept, it stays attached to the neighbouring word."""
        tokens = word_tokeniser_keep_punctuation.tokenise("Hello, world!")
        assert tokens == ["hello,", "world!"]

    def test_empty_string(self, word_tokeniser: WordTokeniser) -> None:
        """Empty input yields an empty token list."""
        tokens = word_tokeniser.tokenise("")
        assert tokens == []

    def test_whitespace_only(self, word_tokeniser: WordTokeniser) -> None:
        """Input made only of spaces, tabs and newlines yields no tokens."""
        tokens = word_tokeniser.tokenise(" \t\n ")
        assert tokens == []

    def test_multiple_spaces(self, word_tokeniser: WordTokeniser) -> None:
        """Runs of consecutive spaces must not produce empty tokens."""
        # The input deliberately contains several spaces in a row; a single
        # space would not exercise the behaviour this test is named after.
        tokens = word_tokeniser.tokenise("hello   world")
        assert tokens == ["hello", "world"]

    def test_unicode_nfc_normalisation(self) -> None:
        """Composed and decomposed spellings of the same word tokenise alike."""
        # 'é' can be composed (U+00E9) or decomposed (e + U+0301).
        composed = "caf\u00e9"  # café with composed é
        decomposed = "cafe\u0301"  # café with decomposed é

        tokeniser = WordTokeniser()
        tokens_composed = tokeniser.tokenise(composed)
        tokens_decomposed = tokeniser.tokenise(decomposed)

        # After NFC normalisation, both should be the same.
        assert tokens_composed == tokens_decomposed
        # Spell the expected value with an escape so the assertion does not
        # depend on how an editor happened to encode a literal 'é' on disk.
        assert tokens_composed == ["caf\u00e9"]

    def test_unicode_emoji(self, word_tokeniser: WordTokeniser) -> None:
        """Emoji do not appear in the output of the default tokeniser."""
        # Emoji are removed as punctuation by default.
        tokens = word_tokeniser.tokenise("Hello 👋 world 🌍")
        assert tokens == ["hello", "world"]

    def test_unicode_non_latin(self, word_tokeniser: WordTokeniser) -> None:
        """Space-separated Japanese text is kept intact as two tokens."""
        tokens = word_tokeniser.tokenise("日本語 テスト")
        assert tokens == ["日本語", "テスト"]

    def test_unicode_mixed_scripts(self, word_tokeniser: WordTokeniser) -> None:
        """Latin, CJK and Cyrillic words can be mixed in one input."""
        tokens = word_tokeniser.tokenise("Hello 世界 Bonjour мир")
        assert tokens == ["hello", "世界", "bonjour", "мир"]

    def test_numbers_preserved(self, word_tokeniser: WordTokeniser) -> None:
        """Digit sequences survive tokenisation as their own token."""
        tokens = word_tokeniser.tokenise("I have 42 apples")
        assert tokens == ["i", "have", "42", "apples"]

    def test_contractions(self, word_tokeniser: WordTokeniser) -> None:
        """Contractions are split at the apostrophe."""
        tokens = word_tokeniser.tokenise("I can't don't won't")
        # Apostrophe is replaced with space, splitting the words.
        assert tokens == ["i", "can", "t", "don", "t", "won", "t"]

    def test_hyphenated_words(self, word_tokeniser: WordTokeniser) -> None:
        """Hyphenated compounds are split into their component words."""
        tokens = word_tokeniser.tokenise("state-of-the-art")
        # Hyphens are removed as punctuation.
        assert tokens == ["state", "of", "the", "art"]

    def test_custom_normalisation_form(self) -> None:
        """A non-default normalisation form can be selected at construction."""
        tokeniser = WordTokeniser(normalisation_form="NFKC")
        # NFKC normalises compatibility characters:
        # ™ (U+2122) is compatibility equivalent to 'TM'.
        tokens = tokeniser.tokenise("test™")
        assert tokens == ["testtm"]
|
|
|
|
|
|
class TestTokeniserProtocol:
    """Checks that ``WordTokeniser`` offers the ``Tokeniser`` interface."""

    def test_word_tokeniser_implements_protocol(self) -> None:
        """``tokenise`` exists, is callable, and returns a list of strings."""
        candidate: Tokeniser = WordTokeniser()

        # The required method must be present and callable.
        assert hasattr(candidate, "tokenise")
        assert callable(candidate.tokenise)

        # The return value must be a list whose items are all strings.
        produced = candidate.tokenise("test")
        assert isinstance(produced, list)
        for item in produced:
            assert isinstance(item, str)
|