Source code for string2string.misc.default_tokenizer

"""
This file contains the default tokenizer.
"""

from typing import List

# Tokenizer class
class Tokenizer:
    """
    This class contains the default tokenizer.

    The tokenizer splits a string on a fixed delimiter and joins tokens back
    together with that same delimiter, so that
    ``detokenize(tokenize(text)) == text``.
    """

    def __init__(
        self,
        word_delimiter: str = " ",
    ) -> None:
        """
        Initializes the Tokenizer class.

        Arguments:
            word_delimiter (str): The word delimiter. Default is " ".
                An empty string selects character-level tokenization.
        """
        # Set the word delimiter
        self.word_delimiter = word_delimiter

    # Tokenize
    def tokenize(
        self,
        text: str,
    ) -> List[str]:
        """
        Returns the tokens from a string.

        With a non-empty delimiter the text is split on every occurrence of
        the delimiter, so consecutive delimiters produce empty-string tokens
        and an empty text produces ``[""]``. With an empty delimiter the text
        is split into its individual characters.

        Arguments:
            text (str): The text to tokenize.

        Returns:
            List[str]: The tokens.
        """
        # str.split rejects an empty separator with a ValueError, whereas
        # detokenize handles one fine; fall back to per-character tokens so
        # both directions accept the same delimiter.
        if self.word_delimiter == "":
            return list(text)
        return text.split(self.word_delimiter)

    # Detokenize
    def detokenize(
        self,
        tokens: List[str],
    ) -> str:
        """
        Returns the string from a list of tokens.

        Arguments:
            tokens (List[str]): The tokens.

        Returns:
            str: The tokens joined by the word delimiter.
        """
        return self.word_delimiter.join(tokens)