feat(core): add tokenisation with unicode support
Implement Tokeniser protocol and WordTokeniser class with NFC Unicode normalisation, optional lowercasing, and punctuation removal.
This commit is contained in:
81
src/veritext/core/tokenisation.py
Normal file
81
src/veritext/core/tokenisation.py
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
"""Tokenisation utilities for text processing."""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
|
from typing import Literal, Protocol
|
||||||
|
|
||||||
|
# Names of the Unicode normalisation forms a tokeniser may be configured with;
# the chosen value is handed to ``unicodedata.normalize`` as its form argument.
NormalisationForm = Literal["NFC", "NFD", "NFKC", "NFKD"]
|
||||||
|
|
||||||
|
|
||||||
|
class Tokeniser(Protocol):
    """Structural interface that every text tokeniser satisfies."""

    def tokenise(self, text: str) -> list[str]:
        """
        Split ``text`` into tokens.

        Args:
            text: The string to be tokenised.

        Returns:
            The tokens extracted from ``text``, as a list of strings.
        """
        ...
|
||||||
|
|
||||||
|
|
||||||
|
class WordTokeniser:
    """
    Word-level tokeniser with Unicode normalisation.

    Text is Unicode-normalised, optionally lowercased, optionally stripped of
    punctuation, and finally split on whitespace.

    Punctuation removal replaces each character that is neither a regex word
    character nor whitespace with a single space, so punctuation also acts as
    a token separator ("well-known" becomes two tokens). Two details follow
    from how the regex word class works:

    * Underscores count as word characters and are therefore kept.
    * Unicode combining marks (general categories Mn, Mc, Me) are *not* regex
      word characters, yet they belong to the letter they follow. They are
      explicitly preserved so that decomposed text (NFD/NFKD) and scripts
      that rely on vowel signs or viramas are not split mid-word.
    """

    def __init__(
        self,
        lowercase: bool = True,
        remove_punctuation: bool = True,
        normalisation_form: NormalisationForm = "NFC",
    ) -> None:
        """
        Initialise the tokeniser.

        Args:
            lowercase: Whether to convert tokens to lowercase.
            remove_punctuation: Whether to remove punctuation from tokens.
            normalisation_form: Unicode normalisation form (NFC, NFD, NFKC, NFKD).
        """
        self.lowercase = lowercase
        self.remove_punctuation = remove_punctuation
        self.normalisation_form: NormalisationForm = normalisation_form

        # Matches one character that is neither a word character nor
        # whitespace. These are only *candidates* for removal: combining
        # marks also match here and are filtered back in by
        # ``_blank_unless_mark`` during substitution.
        self._punctuation_pattern = re.compile(r"[^\w\s]", re.UNICODE)

    @staticmethod
    def _blank_unless_mark(match: re.Match[str]) -> str:
        """Return the matched character if it is a combining mark, else a space."""
        char = match.group()
        # Every "Mark" general category (Mn, Mc, Me) starts with "M".
        if unicodedata.category(char).startswith("M"):
            return char
        return " "

    def tokenise(self, text: str) -> list[str]:
        """
        Tokenise text into words.

        Args:
            text: The text to tokenise.

        Returns:
            List of word tokens. Empty list if text is empty or whitespace-only.

        Raises:
            ValueError: Propagated from ``unicodedata.normalize`` when the
                configured normalisation form is not recognised and the text
                is non-blank.
        """
        # Blank input short-circuits before any normalisation is attempted.
        if not text or not text.strip():
            return []

        normalised = unicodedata.normalize(self.normalisation_form, text)

        if self.lowercase:
            normalised = normalised.lower()

        if self.remove_punctuation:
            # Replace with a space rather than deleting so that punctuation
            # between words still separates them; combining marks survive.
            normalised = self._punctuation_pattern.sub(
                self._blank_unless_mark, normalised
            )

        # str.split() with no arguments collapses whitespace runs and never
        # yields empty strings.
        return normalised.split()
|
||||||
Reference in New Issue
Block a user