Source code for py_entitymatching.feature.tokenizers

# coding=utf-8
"""
This module contains the tokenizer functions supported by py_entitymatching.
"""
import logging

import pandas as pd
import numpy as np
import six

import py_stringmatching as sm
import py_entitymatching.utils.generic_helper as gh

logger = logging.getLogger(__name__)

# Initialize global tokenizers
_global_tokenizers = pd.DataFrame(
    {'function_name': ['tok_qgram', 'tok_delim', 'tok_wspace'],
     'short_name': ['qgm', 'dlm', 'wsp']})


def get_tokenizers_for_blocking(q=[2, 3], dlm_char=[' ']):
    """
    This function returns the single argument tokenizers that can be used for
    blocking purposes (typically in rule-based blocking).

    Args:
        q (list): The list of integers (i.e., q values) for which the q-gram
            tokenizer must be generated (defaults to [2, 3]).
        dlm_char (list): The list of characters (i.e., delimiter characters)
            for which the delimiter tokenizer must be generated (defaults to
            [' ']).

    Returns:
        A Python dictionary with tokenizer name as the key and tokenizer
        function as the value.

    Raises:
        AssertionError: If both `q` and `dlm_char` are set to None.

    Examples:
        >>> import py_entitymatching as em
        >>> block_t = em.get_tokenizers_for_blocking()
        >>> block_t = em.get_tokenizers_for_blocking(q=[3], dlm_char=None)
        >>> block_t = em.get_tokenizers_for_blocking(q=None, dlm_char=[' '])

    """
    # Validate inputs
    if q is None and dlm_char is None:
        logger.error('Both q and dlm_char cannot be null')
        raise AssertionError('Both q and dlm_char cannot be null')
    else:
        # Return single arg tokenizers for the given inputs.
        return _get_single_arg_tokenizers(q, dlm_char)
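
# --- Illustrative sketch (not part of the original module) -----------------
# A minimal example of how the dictionary returned by
# get_tokenizers_for_blocking() can be used. With the defaults, the keys are
# 'qgm_2', 'qgm_3', 'wspace', 'alphabetic', 'alphanumeric' and 'dlm_dc0'
# (see _get_single_arg_tokenizers below), and each value is a single-argument
# tokenizer function. The helper name below is hypothetical.
def _example_apply_blocking_tokenizers(input_string='data science'):
    tokenizers = get_tokenizers_for_blocking()
    # Apply every generated tokenizer to the same input string and collect
    # the token lists keyed by tokenizer name.
    return {name: tok(input_string) for name, tok in tokenizers.items()}
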
def get_tokenizers_for_matching(q=[2, 3], dlm_char=[' ']):
    """
    This function returns the single argument tokenizers that can be used for
    matching purposes.

    Args:
        q (list): The list of integers (i.e., q values) for which the q-gram
            tokenizer must be generated (defaults to [2, 3]).
        dlm_char (list): The list of characters (i.e., delimiter characters)
            for which the delimiter tokenizer must be generated (defaults to
            [' ']).

    Returns:
        A Python dictionary with tokenizer name as the key and tokenizer
        function as the value.

    Raises:
        AssertionError: If both `q` and `dlm_char` are set to None.

    Examples:
        >>> import py_entitymatching as em
        >>> match_t = em.get_tokenizers_for_matching()
        >>> match_t = em.get_tokenizers_for_matching(q=[3], dlm_char=None)
        >>> match_t = em.get_tokenizers_for_matching(q=None, dlm_char=[' '])

    """
    # Validate inputs
    if q is None and dlm_char is None:
        logger.error('Both q and dlm_char cannot be null')
        raise AssertionError('Both q and dlm_char cannot be null')
    else:
        # Return single arg tokenizers for the given inputs.
        return _get_single_arg_tokenizers(q, dlm_char)
def _get_single_arg_tokenizers(q=[2, 3], dlm_char=[' ']):
    """
    This function creates single argument tokenizers for the given input
    parameters.
    """
    # Validate the input parameters
    if q is None and dlm_char is None:
        logger.error('Both q and dlm_char cannot be null')
        raise AssertionError('Both q and dlm_char cannot be null')

    # Initialize the name (key) and function (value) lists used to build the
    # tokenizer dictionary.
    names = []
    functions = []

    if q is not None:
        if not isinstance(q, list):
            q = [q]

        # Create a q-gram tokenizer for each value in the given list of q's.
        qgm_fn_list = [_make_tok_qgram(k) for k in q]
        qgm_names = ['qgm_' + str(x) for x in q]

        # Update the tokenizer name and function lists.
        names.extend(qgm_names)
        functions.extend(qgm_fn_list)

    names.append('wspace')
    functions.append(tok_wspace)
    names.append('alphabetic')
    functions.append(tok_alphabetic)
    names.append('alphanumeric')
    functions.append(tok_alphanumeric)

    if dlm_char is not None:
        if not isinstance(dlm_char, list) and isinstance(dlm_char,
                                                         six.string_types):
            dlm_char = [dlm_char]

        # Create a delimiter tokenizer for each character in the given list
        # of delimiters.
        dlm_fn_list = [_make_tok_delim(k) for k in dlm_char]

        # Update the tokenizer name and function lists.
        dlm_names = ['dlm_dc' + str(i) for i in range(len(dlm_char))]
        names.extend(dlm_names)
        functions.extend(dlm_fn_list)

    if len(names) > 0 and len(functions) > 0:
        return dict(zip(names, functions))
    else:
        logger.warning('Did not create any tokenizers; returning empty dict.')
        return dict()


def _make_tok_delim(d):
    """
    This function returns a delimiter-based tokenizer with a fixed delimiter.
    """
    def tok_delim(s):
        # Check if the input is null.
        if pd.isnull(s):
            return s

        # Convert the input to a unicode string.
        # Note: Removal of non-ASCII characters should be fixed in the next
        # version.
        # s = remove_non_ascii(s)
        s = gh.convert_to_str_unicode(s)

        # Initialize the tokenizer measure object.
        measure = sm.DelimiterTokenizer(delim_set=[d])

        # Tokenize the input string.
        return measure.tokenize(s)

    return tok_delim


# Return a q-gram-based tokenizer with a fixed q.
def _make_tok_qgram(q):
    """
    This function returns a q-gram-based tokenizer with a fixed q.
    """
    def tok_qgram(s):
        # Check if the input is null.
        if pd.isnull(s):
            return s

        s = gh.convert_to_str_unicode(s)

        # Initialize the tokenizer measure object.
        measure = sm.QgramTokenizer(qval=q)

        # Tokenize the input string.
        return measure.tokenize(s)

    return tok_qgram
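
# --- Illustrative sketch (not part of the original module) -----------------
# The factories above return closures that fix q (or the delimiter character)
# at creation time, so the generated tokenizers take a single string argument
# and can be stored in the name -> function dictionary built by
# _get_single_arg_tokenizers. The helper name below is hypothetical.
def _example_factory_closures():
    tok3 = _make_tok_qgram(3)          # single-argument 3-gram tokenizer
    tok_comma = _make_tok_delim(',')   # single-argument comma tokenizer
    return tok3('data'), tok_comma('data,science')
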
# q-gram tokenizer
def tok_qgram(input_string, q):
    """
    This function splits the input string into a list of q-grams. Note that,
    by default, the input strings are padded and then tokenized.

    Args:
        input_string (string): Input string that should be tokenized.
        q (int): q-value that should be used to tokenize the input string.

    Returns:
        A list of tokens, if the input string is not NaN, else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_qgram('database', q=2)
        ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']
        >>> em.tok_qgram('database', q=3)
        ['##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 'se$', 'e$$']
        >>> em.tok_qgram(None, q=2)
        nan

    """
    if pd.isnull(input_string):
        return np.NaN

    input_string = gh.convert_to_str_unicode(input_string)

    measure = sm.QgramTokenizer(qval=q)
    return measure.tokenize(input_string)
def tok_delim(input_string, d):
    """
    This function splits the input string into a list of tokens (based on the
    delimiter).

    Args:
        input_string (string): Input string that should be tokenized.
        d (string): Delimiter string.

    Returns:
        A list of tokens, if the input string is not NaN, else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_delim('data science', ' ')
        ['data', 'science']
        >>> em.tok_delim('data$#$science', '$#$')
        ['data', 'science']
        >>> em.tok_delim(None, ' ')
        nan

    """
    if pd.isnull(input_string):
        return np.NaN

    input_string = gh.convert_to_str_unicode(input_string)

    measure = sm.DelimiterTokenizer(delim_set=[d])
    return measure.tokenize(input_string)
def tok_wspace(input_string):
    """
    This function splits the input string into a list of tokens (based on
    whitespace).

    Args:
        input_string (string): Input string that should be tokenized.

    Returns:
        A list of tokens, if the input string is not NaN, else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_wspace('data science')
        ['data', 'science']
        >>> em.tok_wspace('data   science')
        ['data', 'science']
        >>> em.tok_wspace(None)
        nan

    """
    if pd.isnull(input_string):
        return np.NaN

    # input_string = remove_non_ascii(input_string)
    input_string = gh.convert_to_str_unicode(input_string)

    measure = sm.WhitespaceTokenizer()
    return measure.tokenize(input_string)
def tok_alphabetic(input_string):
    """
    This function returns a list of tokens that are maximal sequences of
    consecutive alphabetical characters.

    Args:
        input_string (string): Input string that should be tokenized.

    Returns:
        A list of tokens, if the input string is not NaN, else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_alphabetic('data99science, data#integration.')
        ['data', 'science', 'data', 'integration']
        >>> em.tok_alphabetic('99')
        []
        >>> em.tok_alphabetic(None)
        nan

    """
    if pd.isnull(input_string):
        return np.NaN

    measure = sm.AlphabeticTokenizer()
    input_string = gh.convert_to_str_unicode(input_string)

    return measure.tokenize(input_string)
def tok_alphanumeric(input_string):
    """
    This function returns a list of tokens that are maximal sequences of
    consecutive alphanumeric characters.

    Args:
        input_string (string): Input string that should be tokenized.

    Returns:
        A list of tokens, if the input string is not NaN, else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_alphanumeric('data9,(science), data9#.(integration).88')
        ['data9', 'science', 'data9', 'integration', '88']
        >>> em.tok_alphanumeric('#.$')
        []
        >>> em.tok_alphanumeric(None)
        nan

    """
    if pd.isnull(input_string):
        return np.NaN

    input_string = gh.convert_to_str_unicode(input_string)

    measure = sm.AlphanumericTokenizer()
    return measure.tokenize(input_string)
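
# --- Illustrative usage sketch (not part of the original module) -----------
# Because every tokenizer returns NaN for null inputs, the functions can be
# applied directly to a pandas Series that may contain missing values. This
# demo assumes pandas and py_stringmatching are installed, as imported above.
if __name__ == '__main__':
    _strings = pd.Series(['data science', 'data9#integration', None])
    print(_strings.apply(tok_wspace))          # whitespace tokens per row
    print(_strings.apply(tok_alphanumeric))    # alphanumeric tokens per row
    print(tok_qgram('database', q=2))          # padded 2-grams
    print(tok_delim('data$#$science', '$#$'))  # split on a custom delimiter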