# Source code for py_stringmatching.tokenizer.delimiter_tokenizer

import re

from py_stringmatching import utils
from py_stringmatching.tokenizer.tokenizer import Tokenizer


class DelimiterTokenizer(Tokenizer):
    """Uses delimiters to find tokens, as opposed to using definitions.

    Examples of delimiters include white space and punctuations. Examples
    of definitions include alphabetical and qgram tokens.

    Args:
        delim_set (set): A set of delimiter strings (defaults to space
                         delimiter).
        return_set (boolean): A flag to indicate whether to return a set of
                              tokens instead of a bag of tokens (defaults
                              to False).

    Attributes:
        return_set (boolean): An attribute to store the value of the flag
                              return_set.
    """

    def __init__(self, delim_set=set([' ']), return_set=False):
        self.__delim_set = None
        self.__use_split = None
        self.__delim_str = None
        self.__delim_regex = None
        self._update_delim_set(delim_set)
        super(DelimiterTokenizer, self).__init__(return_set)
    def tokenize(self, input_string):
        """Tokenizes input string based on the set of delimiters.

        Args:
            input_string (str): The string to be tokenized.

        Returns:
            A Python list, which is a set or a bag of tokens depending on
            whether the return_set flag is True or False.

        Raises:
            TypeError : If the input is not a string.

        Examples:
            >>> delim_tok = DelimiterTokenizer()
            >>> delim_tok.tokenize('data science')
            ['data', 'science']
            >>> delim_tok = DelimiterTokenizer(['$#$'])
            >>> delim_tok.tokenize('data$#$science')
            ['data', 'science']
            >>> delim_tok = DelimiterTokenizer([',', '.'])
            >>> delim_tok.tokenize('data,science.data,integration.')
            ['data', 'science', 'data', 'integration']
            >>> delim_tok = DelimiterTokenizer([',', '.'], return_set=True)
            >>> delim_tok.tokenize('data,science.data,integration.')
            ['data', 'science', 'integration']
        """
        utils.tok_check_for_none(input_string)
        utils.tok_check_for_string_input(input_string)

        if self.__use_split:
            token_list = list(filter(None,
                                     input_string.split(self.__delim_str)))
        else:
            token_list = list(filter(None,
                                     self.__delim_regex.split(input_string)))

        if self.return_set:
            return utils.convert_bag_to_set(token_list)

        return token_list
    def get_delim_set(self):
        """Gets the current set of delimiters.

        Returns:
            A Python set, which is the current set of delimiters.
        """
        return self.__delim_set
    def set_delim_set(self, delim_set):
        """Sets the current set of delimiters.

        Args:
            delim_set (set): A set of delimiter strings.
        """
        return self._update_delim_set(delim_set)
    def _update_delim_set(self, delim_set):
        if not isinstance(delim_set, set):
            delim_set = set(delim_set)
        self.__delim_set = delim_set
        # if there is only one delimiter string, use the faster str.split
        # instead of a regex
        self.__use_split = False
        if len(self.__delim_set) == 1:
            self.__delim_str = list(self.__delim_set)[0]
            self.__use_split = True
        else:
            # escape each delimiter so regex metacharacters (e.g. '.', '$')
            # are matched literally, then alternate over all delimiters
            self.__delim_regex = re.compile('|'.join(
                map(re.escape, self.__delim_set)))
        return True
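
# ----------------------------------------------------------------------
# A minimal usage sketch (not part of the original module). It assumes
# py_stringmatching is installed, and simply exercises the two code paths
# above: str.split for a single delimiter, and the compiled regex for
# multiple delimiters.
if __name__ == '__main__':
    # single delimiter: tokenize() takes the str.split fast path
    ws_tok = DelimiterTokenizer()
    print(ws_tok.tokenize('data science'))           # ['data', 'science']

    # multiple delimiters: tokenize() splits with the compiled regex;
    # re.escape above ensures '.' is matched literally, not as "any char"
    punct_tok = DelimiterTokenizer(delim_set=set([',', '.']))
    print(punct_tok.tokenize('data,science.data.'))  # ['data', 'science', 'data']

    # delimiters can be swapped at runtime via set_delim_set()
    punct_tok.set_delim_set(set(['$#$']))
    print(punct_tok.tokenize('data$#$science'))      # ['data', 'science']

    # return_set=True deduplicates tokens, keeping first occurrences
    set_tok = DelimiterTokenizer(delim_set=set([',']), return_set=True)
    print(set_tok.tokenize('data,science,data'))     # ['data', 'science']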