Cover WordTokeniser (Unicode, empty input, punctuation, multiple scripts) and validation types (immutability, edge cases, failure summary).
125 lines
5.3 KiB
Python
125 lines
5.3 KiB
Python
"""Tests for the tokenisation module."""
|
|
|
|
from typing import TYPE_CHECKING
|
|
|
|
from veritext.core.tokenisation import WordTokeniser
|
|
|
|
if TYPE_CHECKING:
|
|
from veritext.core.tokenisation import Tokeniser
|
|
|
|
|
|
class TestWordTokeniser:
    """Behavioural tests for ``WordTokeniser.tokenise``.

    Most tests receive a pre-configured tokeniser through a pytest fixture
    (``word_tokeniser``, ``word_tokeniser_no_lowercase``,
    ``word_tokeniser_keep_punctuation``).  Those fixtures are not defined in
    this module -- presumably they live in a ``conftest.py``; verify there
    for the exact constructor arguments.
    """

    def test_basic_tokenisation(self, word_tokeniser: WordTokeniser) -> None:
        """A plain sentence is split into lowercased words."""
        tokens = word_tokeniser.tokenise("The cat sat on the mat")
        assert tokens == ["the", "cat", "sat", "on", "the", "mat"]

    def test_lowercasing(self, word_tokeniser: WordTokeniser) -> None:
        """Tokens come back lowercased with the default fixture."""
        tokens = word_tokeniser.tokenise("Hello WORLD")
        assert tokens == ["hello", "world"]

    def test_no_lowercasing(self, word_tokeniser_no_lowercase: WordTokeniser) -> None:
        """Original casing is preserved when lowercasing is disabled."""
        tokens = word_tokeniser_no_lowercase.tokenise("Hello WORLD")
        assert tokens == ["Hello", "WORLD"]

    def test_punctuation_removal(self, word_tokeniser: WordTokeniser) -> None:
        """Punctuation is stripped with the default fixture."""
        tokens = word_tokeniser.tokenise("Hello, world! How are you?")
        assert tokens == ["hello", "world", "how", "are", "you"]

    def test_keep_punctuation(
        self, word_tokeniser_keep_punctuation: WordTokeniser
    ) -> None:
        """Punctuation stays attached to its word when removal is disabled."""
        tokens = word_tokeniser_keep_punctuation.tokenise("Hello, world!")
        assert tokens == ["hello,", "world!"]

    def test_empty_string(self, word_tokeniser: WordTokeniser) -> None:
        """An empty string yields an empty token list."""
        tokens = word_tokeniser.tokenise("")
        assert tokens == []

    def test_whitespace_only(self, word_tokeniser: WordTokeniser) -> None:
        """A string made only of whitespace yields an empty token list."""
        tokens = word_tokeniser.tokenise(" \t\n ")
        assert tokens == []

    def test_multiple_spaces(self, word_tokeniser: WordTokeniser) -> None:
        """Runs of several spaces between words produce no empty tokens."""
        # The input must contain a genuine run of spaces; with a single space
        # this test would be indistinguishable from the basic case and could
        # not catch a tokeniser that splits on one literal space character.
        tokens = word_tokeniser.tokenise("hello    world")
        assert tokens == ["hello", "world"]

    def test_unicode_nfc_normalisation(self) -> None:
        """Composed and decomposed spellings tokenise to the same NFC token."""
        # 'é' can be composed (U+00E9) or decomposed (e + U+0301).
        composed = "caf\u00e9"
        decomposed = "cafe\u0301"

        tokeniser = WordTokeniser()
        tokens_composed = tokeniser.tokenise(composed)
        tokens_decomposed = tokeniser.tokenise(decomposed)

        # After NFC normalisation, both inputs should give the same result.
        assert tokens_composed == tokens_decomposed
        # The expected value is spelled with an escape so that it is
        # unambiguously the composed form, regardless of how an editor
        # saved this file.
        assert tokens_composed == ["caf\u00e9"]

    def test_unicode_emoji(self, word_tokeniser: WordTokeniser) -> None:
        """Emoji do not survive as tokens with the default fixture."""
        # Expected behaviour: emoji are removed along with punctuation.
        tokens = word_tokeniser.tokenise("Hello 👋 world 🌍")
        assert tokens == ["hello", "world"]

    def test_unicode_non_latin(self, word_tokeniser: WordTokeniser) -> None:
        """Non-Latin words are kept intact and split on whitespace."""
        tokens = word_tokeniser.tokenise("日本語 テスト")
        assert tokens == ["日本語", "テスト"]

    def test_unicode_mixed_scripts(self, word_tokeniser: WordTokeniser) -> None:
        """Latin, CJK and Cyrillic words can be mixed in one input."""
        tokens = word_tokeniser.tokenise("Hello 世界 Bonjour мир")
        assert tokens == ["hello", "世界", "bonjour", "мир"]

    def test_numbers_preserved(self, word_tokeniser: WordTokeniser) -> None:
        """Digit sequences are kept as tokens."""
        tokens = word_tokeniser.tokenise("I have 42 apples")
        assert tokens == ["i", "have", "42", "apples"]

    def test_contractions(self, word_tokeniser: WordTokeniser) -> None:
        """Contractions are split at the apostrophe."""
        tokens = word_tokeniser.tokenise("I can't don't won't")
        # Expected behaviour: the apostrophe acts as a separator, so each
        # contraction becomes two tokens.
        assert tokens == ["i", "can", "t", "don", "t", "won", "t"]

    def test_hyphenated_words(self, word_tokeniser: WordTokeniser) -> None:
        """Hyphenated compounds are split into their component words."""
        tokens = word_tokeniser.tokenise("state-of-the-art")
        # Expected behaviour: hyphens are treated like other punctuation.
        assert tokens == ["state", "of", "the", "art"]

    def test_custom_normalisation_form(self) -> None:
        """A non-default normalisation form (NFKC) is honoured."""
        tokeniser = WordTokeniser(normalisation_form="NFKC")
        # NFKC folds compatibility characters: '™' (U+2122) becomes 'TM',
        # which the default lowercasing then turns into 'tm'.
        tokens = tokeniser.tokenise("test™")
        assert tokens == ["testtm"]
|
|
|
|
|
|
class TestTokeniserProtocol:
    """Checks that concrete tokenisers satisfy the ``Tokeniser`` protocol."""

    def test_word_tokeniser_implements_protocol(self) -> None:
        """WordTokeniser offers a callable ``tokenise`` returning a list of str."""
        candidate: Tokeniser = WordTokeniser()

        # The required method must exist and be callable.
        assert hasattr(candidate, "tokenise")
        assert callable(candidate.tokenise)

        # Calling it must produce a list whose elements are all strings.
        produced = candidate.tokenise("test")
        assert isinstance(produced, list)
        for item in produced:
            assert isinstance(item, str)
|