"""Tests for the tokenisation module.""" from typing import TYPE_CHECKING from veritext.core.tokenisation import WordTokeniser if TYPE_CHECKING: from veritext.core.tokenisation import Tokeniser class TestWordTokeniser: """Tests for WordTokeniser.""" def test_basic_tokenisation(self, word_tokeniser: WordTokeniser) -> None: """Test basic word tokenisation.""" tokens = word_tokeniser.tokenise("The cat sat on the mat") assert tokens == ["the", "cat", "sat", "on", "the", "mat"] def test_lowercasing(self, word_tokeniser: WordTokeniser) -> None: """Test that tokens are lowercased by default.""" tokens = word_tokeniser.tokenise("Hello WORLD") assert tokens == ["hello", "world"] def test_no_lowercasing(self, word_tokeniser_no_lowercase: WordTokeniser) -> None: """Test tokenisation without lowercasing.""" tokens = word_tokeniser_no_lowercase.tokenise("Hello WORLD") assert tokens == ["Hello", "WORLD"] def test_punctuation_removal(self, word_tokeniser: WordTokeniser) -> None: """Test that punctuation is removed by default.""" tokens = word_tokeniser.tokenise("Hello, world! How are you?") assert tokens == ["hello", "world", "how", "are", "you"] def test_keep_punctuation( self, word_tokeniser_keep_punctuation: WordTokeniser ) -> None: """Test tokenisation keeping punctuation.""" tokens = word_tokeniser_keep_punctuation.tokenise("Hello, world!") assert tokens == ["hello,", "world!"] def test_empty_string(self, word_tokeniser: WordTokeniser) -> None: """Test that empty string returns empty list.""" tokens = word_tokeniser.tokenise("") assert tokens == [] def test_whitespace_only(self, word_tokeniser: WordTokeniser) -> None: """Test that whitespace-only string returns empty list.""" tokens = word_tokeniser.tokenise(" \t\n ") assert tokens == [] def test_multiple_spaces(self, word_tokeniser: WordTokeniser) -> None: """Test handling of multiple spaces between words.""" tokens = word_tokeniser.tokenise("hello world") assert tokens == ["hello", "world"] def test_unicode_nfc_normalisation(self) -> None: """Test NFC Unicode normalisation.""" # 'é' can be composed (U+00E9) or decomposed (e + U+0301) composed = "caf\u00e9" # café with composed é decomposed = "cafe\u0301" # café with decomposed é tokeniser = WordTokeniser() tokens_composed = tokeniser.tokenise(composed) tokens_decomposed = tokeniser.tokenise(decomposed) # After NFC normalisation, both should be the same assert tokens_composed == tokens_decomposed assert tokens_composed == ["café"] def test_unicode_emoji(self, word_tokeniser: WordTokeniser) -> None: """Test handling of emoji characters.""" # Emoji are removed as punctuation by default tokens = word_tokeniser.tokenise("Hello 👋 world 🌍") assert tokens == ["hello", "world"] def test_unicode_non_latin(self, word_tokeniser: WordTokeniser) -> None: """Test handling of non-Latin scripts.""" tokens = word_tokeniser.tokenise("日本語 テスト") assert tokens == ["日本語", "テスト"] def test_unicode_mixed_scripts(self, word_tokeniser: WordTokeniser) -> None: """Test handling of mixed scripts.""" tokens = word_tokeniser.tokenise("Hello 世界 Bonjour мир") assert tokens == ["hello", "世界", "bonjour", "мир"] def test_numbers_preserved(self, word_tokeniser: WordTokeniser) -> None: """Test that numbers are preserved.""" tokens = word_tokeniser.tokenise("I have 42 apples") assert tokens == ["i", "have", "42", "apples"] def test_contractions(self, word_tokeniser: WordTokeniser) -> None: """Test handling of contractions (apostrophe replaced with space).""" tokens = word_tokeniser.tokenise("I can't don't won't") # Apostrophe is replaced with space, splitting the words assert tokens == ["i", "can", "t", "don", "t", "won", "t"] def test_hyphenated_words(self, word_tokeniser: WordTokeniser) -> None: """Test handling of hyphenated words.""" tokens = word_tokeniser.tokenise("state-of-the-art") # Hyphens are removed as punctuation assert tokens == ["state", "of", "the", "art"] def test_custom_normalisation_form(self) -> None: """Test custom Unicode normalisation form.""" tokeniser = WordTokeniser(normalisation_form="NFKC") # NFKC normalises compatibility characters # ™ (U+2122) is compatibility equivalent to 'TM' tokens = tokeniser.tokenise("test™") assert tokens == ["testtm"] class TestTokeniserProtocol: """Tests for Tokeniser protocol compliance.""" def test_word_tokeniser_implements_protocol(self) -> None: """Test that WordTokeniser implements the Tokeniser protocol.""" tokeniser: Tokeniser = WordTokeniser() # Check that it has the required method assert hasattr(tokeniser, "tokenise") assert callable(tokeniser.tokenise) # Check return type result = tokeniser.tokenise("test") assert isinstance(result, list) assert all(isinstance(t, str) for t in result)