import json
from pathlib import Path

import numpy as np
from tokenizers import Tokenizer
from tokenizers.normalizers import Sequence


class BaseTokenizer:
    """Abstract Tokenizer class"""

    @property
    def vocab_size(self) -> int:
        """Returns the vocabulary size of the tokenizer

        :return: Vocabulary size of tokenizer
        :rtype: int
        """
        raise NotImplementedError

    @property
    def config(self) -> dict:
        """The tokenizer config

        :return: dictionary of tokenizer config
        :rtype: dict
        """
        raise NotImplementedError

    def save(self, path: str | Path) -> None:
        """Saves the configuration of the tokenizer

        Saves these files:
        - tokenizer.json: saved configuration of the tokenizer

        :param path: Path to save the tokenizer to
        :type path: str, :class:`pathlib.Path`
        """
        if isinstance(path, str):
            path = Path(path)
        with open(path, "w") as fp:
            json.dump(self.config, fp)

    @classmethod
    def load(cls, path: str | Path) -> "BaseTokenizer":
        """Returns a :class:`bm25_engine.tokenizer.BaseTokenizer` object from saved configuration

        Requires these files:
        - tokenizer.json: saved configuration of the tokenizer

        :param path: Path to load the tokenizer from
        :type path: str, :class:`pathlib.Path`
        :returns: Configured BaseTokenizer
        :rtype: BaseTokenizer
        """
        if isinstance(path, str):
            path = Path(path)
        with open(path) as fp:
            config = json.load(fp)
        return cls(**config)

    def tokenize(self, texts: str | list[str], pad: bool = True) -> np.ndarray:
        """Tokenizes a string or list of strings into a 2D :class:`numpy.ndarray` of token ids

        :param texts: Texts to be tokenized
        :type texts: str, list
        :param pad: Whether to pad the token ids to a consistent length
        :type pad: bool
        :return: 2D numpy array representing token ids
        :rtype: :class:`numpy.ndarray`
        """
        raise NotImplementedError


class HashTokenizer(BaseTokenizer):
    """Hashing tokenizer stub (not yet implemented)"""

    def __init__(self) -> None:
        super().__init__()


class PretrainedTokenizer(BaseTokenizer):
    """Wrapper for HuggingFace tokenizers, representing a pretrained tokenizer
    (e.g. `bert-base-uncased`). Extends the :class:`bm25_engine.tokenizer.BaseTokenizer` class.

    :param tokenizer: Binding for HuggingFace Rust tokenizers
    :type tokenizer: :class:`tokenizers.Tokenizer`
    :param custom_normalizer: Optional normalizer sequence that replaces the tokenizer's default normalizer
    :type custom_normalizer: :class:`tokenizers.normalizers.Sequence`, optional
    :param add_special_tokens: Whether to add the tokenizer's special tokens when encoding (e.g. `[CLS]`)
    :type add_special_tokens: bool
    :param pad: Whether to pad the input to a consistent length (using `[PAD]` tokens)
    :type pad: bool
    :param model_ident: HuggingFace ID of the model (e.g. `bert-base-uncased`)
    :type model_ident: str
    """

    tokenizer: Tokenizer
    add_special_tokens: bool
    pad: bool
    model_ident: str

    def __init__(
        self,
        model_ident: str,
        custom_normalizer: Sequence | None = None,
        add_special_tokens: bool = False,
        pad: bool = True,
    ) -> None:
        """Constructor method"""
        super().__init__()
        self.add_special_tokens = add_special_tokens
        self.model_ident = model_ident
        self.tokenizer = Tokenizer.from_pretrained(model_ident)
        self.pad = pad
        if custom_normalizer:
            self.tokenizer.normalizer = custom_normalizer  # type: ignore
        if pad:
            self.tokenizer.enable_padding(direction="right", pad_id=0)

    @property
    def vocab_size(self) -> int:
        """Returns the vocabulary size of the tokenizer

        :return: Vocabulary size of tokenizer
        :rtype: int
        """
        return self.tokenizer.get_vocab_size()

    @property
    def config(self) -> dict:
        """The tokenizer config

        :return: dictionary of tokenizer config
        :rtype: dict
        """
        return {
            "model_ident": self.model_ident,
            "add_special_tokens": self.add_special_tokens,
            "pad": self.pad,
        }

    def tokenize(self, texts: str | list[str], pad: bool = True) -> np.ndarray:
        """Tokenizes a string or list of strings into a 2D :class:`numpy.ndarray` of token ids

        :param texts: Texts to be tokenized
        :type texts: str, list
        :param pad: unused here (padding is configured in the constructor)
        :type pad: bool
        :return: 2D numpy array representing token ids
        :rtype: :class:`numpy.ndarray`
        """
        if isinstance(texts, str):
            texts = [texts]
        encodings = self.tokenizer.encode_batch_fast(
            texts, add_special_tokens=self.add_special_tokens
        )
        return np.array([e.ids for e in encodings])


class TokenizerFactory:
    """Tokenizer factory class"""

    @staticmethod
    def get(type_: str, **tokenizer_kwargs) -> BaseTokenizer:
        r"""Get a configured :class:`bm25_engine.tokenizer.BaseTokenizer`

        :param type_: Tokenizer type to instantiate
        :type type_: str
        :param \**tokenizer_kwargs: kwargs to be passed to the tokenizer constructor
        :return: Tokenizer
        :rtype: :class:`bm25_engine.tokenizer.BaseTokenizer`
        """
        match type_:
            case "pretrained":
                return PretrainedTokenizer(**tokenizer_kwargs)
            case _:
                # Unrecognised types fall back to the pretrained tokenizer
                return PretrainedTokenizer(**tokenizer_kwargs)
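

# Usage sketch (illustrative only, not part of the library API): shows how the
# factory, tokenize/save/load round trip are expected to fit together. The
# model id and the "tokenizer.json" path below are assumptions for the example;
# any tokenizer hosted on the HuggingFace Hub should work the same way.
if __name__ == "__main__":
    tokenizer = TokenizerFactory.get("pretrained", model_ident="bert-base-uncased")
    ids = tokenizer.tokenize(["hello world", "a slightly longer sentence"])
    # With pad=True the batch is right-padded, so this is (2, longest sequence in the batch)
    print(ids.shape)

    # Persist the config and rebuild an equivalent tokenizer from it
    tokenizer.save("tokenizer.json")
    restored = PretrainedTokenizer.load("tokenizer.json")
    assert restored.vocab_size == tokenizer.vocab_size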