from six import string_types
from six.moves import xrange
from py_stringmatching import utils
from py_stringmatching.tokenizer.definition_tokenizer import DefinitionTokenizer
class QgramTokenizer(DefinitionTokenizer):
    """Returns tokens that are sequences of q consecutive characters.

    A qgram of an input string s is a substring t (of s) which is a sequence
    of q consecutive characters. Qgrams are also known as ngrams or kgrams.

    Args:
        qval (int): A value for q, that is, the qgram's length
            (defaults to 2).
        padding (boolean): A flag to indicate whether a prefix and a suffix
            should be added to the input string (defaults to True).
        prefix_pad (str): A character (that is, a string of length 1 in
            Python) that should be replicated (qval-1) times and prepended
            to the input string, if padding was set to True
            (defaults to '#').
        suffix_pad (str): A character (that is, a string of length 1 in
            Python) that should be replicated (qval-1) times and appended
            to the input string, if padding was set to True
            (defaults to '$').
        return_set (boolean): A flag to indicate whether to return a set of
            tokens or a bag of tokens (defaults to False). It is forwarded
            unchanged to the parent class constructor.

    Attributes:
        qval (int): An attribute to store the q value.
        return_set (boolean): An attribute to store the flag return_set.
        padding (boolean): An attribute to store the padding flag.
        prefix_pad (str): An attribute to store the prefix string that
            should be used for padding.
        suffix_pad (str): An attribute to store the suffix string that
            should be used for padding.

    Raises:
        AssertionError: If qval is less than 1, if padding is not a boolean,
            or if prefix_pad / suffix_pad is not a string of length 1.
    """

    def __init__(self, qval=2,
                 padding=True, prefix_pad='#', suffix_pad='$',
                 return_set=False):
        self._check_qval(qval)
        self.qval = qval

        self._check_padding(padding)
        self.padding = padding

        # Both pads are type-checked before either is length-checked, so the
        # first error reported for a bad argument combination stays the same
        # as it has always been.
        self._check_pad_is_string(prefix_pad, 'prefix_pad')
        self._check_pad_is_string(suffix_pad, 'suffix_pad')
        self._check_pad_length(prefix_pad, 'prefix_pad')
        self._check_pad_length(suffix_pad, 'suffix_pad')
        self.prefix_pad = prefix_pad
        self.suffix_pad = suffix_pad

        super(QgramTokenizer, self).__init__(return_set)

    # ------------------------------------------------------------------
    # Private validation helpers, shared by __init__ and the setters so
    # that both entry points enforce exactly the same rules and messages.
    # ------------------------------------------------------------------

    @staticmethod
    def _check_qval(qval):
        """Raises AssertionError if qval is less than 1."""
        if qval < 1:
            raise AssertionError("qval cannot be less than 1")

    @staticmethod
    def _check_padding(padding):
        """Raises AssertionError if padding is not a boolean."""
        if not isinstance(padding, bool):
            raise AssertionError('padding is expected to be boolean type')

    @staticmethod
    def _check_pad_is_string(pad, name):
        """Raises AssertionError if pad (argument called name) is not a string."""
        if not isinstance(pad, string_types):
            raise AssertionError('%s is expected to be of type string' % name)

    @staticmethod
    def _check_pad_length(pad, name):
        """Raises AssertionError if pad (argument called name) is not 1 long."""
        if len(pad) != 1:
            raise AssertionError("%s should have length equal to 1" % name)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def tokenize(self, input_string):
        """Tokenizes input string into qgrams.

        Args:
            input_string (str): The string to be tokenized.

        Returns:
            A Python list, which is a set or a bag of qgrams, depending on
            whether return_set flag is True or False. An empty list is
            returned when the (optionally padded) string is shorter than
            qval.

        Raises:
            TypeError : If the input is not a string (as reported by the
                input checks in ``utils``).

        Examples:
            >>> qg2_tok = QgramTokenizer()
            >>> qg2_tok.tokenize('database')
            ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']
            >>> qg2_tok.tokenize('a')
            ['#a', 'a$']
            >>> qg3_tok = QgramTokenizer(qval=3)
            >>> qg3_tok.tokenize('database')
            ['##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 'se$', 'e$$']
            >>> qg2_nopad = QgramTokenizer(padding=False)
            >>> qg2_nopad.tokenize('database')
            ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
            >>> qg2_diffpads = QgramTokenizer(prefix_pad='^', suffix_pad='!')
            >>> qg2_diffpads.tokenize('database')
            ['^d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e!']
        """
        utils.tok_check_for_none(input_string)
        utils.tok_check_for_string_input(input_string)

        qval = self.qval

        # If the padding flag is set, add q-1 "prefix_pad" characters in
        # front of the input string and q-1 "suffix_pad" characters at its
        # end, so that every original character appears in q qgrams.
        if self.padding:
            pad_count = qval - 1
            input_string = (self.prefix_pad * pad_count
                            + input_string
                            + self.suffix_pad * pad_count)

        # Number of windows of width qval that fit in the string.
        num_qgrams = len(input_string) - (qval - 1)
        if num_qgrams < 1:
            return []

        # Slide a window of width qval over the string. With qval >= 1 every
        # slice is non-empty; the truthiness filter only drops empty strings
        # and is kept so the output never contains an empty token.
        windows = (input_string[i:i + qval] for i in xrange(num_qgrams))
        qgram_list = [qgram for qgram in windows if qgram]

        if self.return_set:
            return utils.convert_bag_to_set(qgram_list)
        return qgram_list

    def get_qval(self):
        """Gets the value of the qval attribute, which is the length of qgrams.

        Returns:
            The value of the qval attribute.
        """
        return self.qval

    def set_qval(self, qval):
        """Sets the value of the qval attribute.

        Args:
            qval (int): A value for q (the length of qgrams).

        Returns:
            The Boolean value of True is returned if the update was
            successful.

        Raises:
            AssertionError : If qval is less than 1.
        """
        self._check_qval(qval)
        self.qval = qval
        return True

    def get_padding(self):
        """Gets the value of the padding flag.

        This flag determines whether the padding should be done for the
        input strings or not.

        Returns:
            The Boolean value of the padding flag.
        """
        return self.padding

    def set_padding(self, padding):
        """Sets the value of the padding flag.

        Args:
            padding (boolean): Flag to indicate whether padding should be
                done or not.

        Returns:
            The Boolean value of True is returned if the update was
            successful.

        Raises:
            AssertionError: If the padding is not of type boolean.
        """
        self._check_padding(padding)
        self.padding = padding
        return True

    def get_prefix_pad(self):
        """Gets the value of the prefix pad.

        Returns:
            The prefix pad string.
        """
        return self.prefix_pad

    def set_prefix_pad(self, prefix_pad):
        """Sets the value of the prefix pad string.

        Args:
            prefix_pad (str): String that should be prepended to the
                input string before tokenization.

        Returns:
            The Boolean value of True is returned if the update was
            successful.

        Raises:
            AssertionError: If the prefix_pad is not of type string.
            AssertionError: If the length of prefix_pad is not one.
        """
        self._check_pad_is_string(prefix_pad, 'prefix_pad')
        self._check_pad_length(prefix_pad, 'prefix_pad')
        self.prefix_pad = prefix_pad
        return True

    def get_suffix_pad(self):
        """Gets the value of the suffix pad.

        Returns:
            The suffix pad string.
        """
        return self.suffix_pad

    def set_suffix_pad(self, suffix_pad):
        """Sets the value of the suffix pad string.

        Args:
            suffix_pad (str): String that should be appended to the
                input string before tokenization.

        Returns:
            The Boolean value of True is returned if the update was
            successful.

        Raises:
            AssertionError: If the suffix_pad is not of type string.
            AssertionError: If the length of suffix_pad is not one.
        """
        self._check_pad_is_string(suffix_pad, 'suffix_pad')
        self._check_pad_length(suffix_pad, 'suffix_pad')
        self.suffix_pad = suffix_pad
        return True