feat(core): add tokenisation with unicode support

Implement Tokeniser protocol and WordTokeniser class with NFC Unicode normalisation, optional lowercasing, and punctuation removal.
2026-02-03 16:16:07 +00:00
parent efc6a031a3
commit 697b1ddfeb
1 changed files with 81 additions and 0 deletions
@@ -0,0 +1,81 @@
 """Tokenisation utilities for text processing."""
 import re
 import unicodedata
 from typing import Literal, Protocol
 NormalisationForm = Literal["NFC", "NFD", "NFKC", "NFKD"]
 class Tokeniser(Protocol):
    """Structural interface for text tokenisers."""

    def tokenise(self, text: str) -> list[str]:
        """
        Break a string into tokens.

        Args:
            text: Input string to split.

        Returns:
            The resulting tokens as a list of strings.
        """
        ...
 class WordTokeniser:
    """
    Word-level tokeniser with Unicode normalisation.

    Text is normalised, optionally lowercased, optionally stripped of
    punctuation, and then split on whitespace.

    Combining marks (Unicode general category ``M*``) are treated as part of
    the word they belong to, not as punctuation. This matters for
    decomposed text (NFD/NFKD), for scripts that use vowel signs, and for
    lowercase mappings that yield a base letter plus a mark.
    """

    def __init__(
        self,
        lowercase: bool = True,
        remove_punctuation: bool = True,
        normalisation_form: NormalisationForm = "NFC",
    ) -> None:
        """
        Initialise the tokeniser.

        Args:
            lowercase: Whether to convert tokens to lowercase.
            remove_punctuation: Whether to remove punctuation from tokens.
            normalisation_form: Unicode normalisation form (NFC, NFD, NFKC, NFKD).
        """
        self.lowercase = lowercase
        self.remove_punctuation = remove_punctuation
        self.normalisation_form: NormalisationForm = normalisation_form
        # Candidate characters for removal: anything that is neither a word
        # character (letters, digits, underscore) nor whitespace. Combining
        # marks also match here and are filtered out in _blank_unless_mark.
        self._punctuation_pattern = re.compile(r"[^\w\s]", re.UNICODE)
        # Used to discard tokens that contain no word character at all.
        self._word_char_pattern = re.compile(r"\w", re.UNICODE)

    @staticmethod
    def _blank_unless_mark(match: re.Match[str]) -> str:
        """Return the matched character if it is a combining mark, else a space."""
        char = match.group()
        # \w does not cover categories Mn/Mc/Me, so without this check an
        # accent or vowel sign would be blanked out and split its word.
        if unicodedata.category(char).startswith("M"):
            return char
        return " "

    def tokenise(self, text: str) -> list[str]:
        """
        Tokenise text into words.

        Args:
            text: The text to tokenise.

        Returns:
            List of word tokens. Empty list if text is empty or whitespace-only.

        Raises:
            ValueError: If ``normalisation_form`` is not a form accepted by
                ``unicodedata.normalize`` (raised only for non-blank text).
        """
        if not text or not text.strip():
            return []

        normalised = unicodedata.normalize(self.normalisation_form, text)

        if self.lowercase:
            normalised = normalised.lower()

        if not self.remove_punctuation:
            return normalised.split()

        cleaned = self._punctuation_pattern.sub(self._blank_unless_mark, normalised)
        # After substitution a token holds only word characters and marks;
        # one without any word character is an orphaned mark, so drop it.
        has_word_char = self._word_char_pattern.search
        return [token for token in cleaned.split() if has_word_char(token)]