# Source code for py_entitymatching.feature.addfeatures

"""
This module contains functions to add a feature to a feature table.
"""
import logging

import pandas as pd
import six

from py_entitymatching.utils.validation_helper import validate_object_type

logger = logging.getLogger(__name__)


def get_feature_fn(feature_string, tokenizers, similarity_functions):
    """
    This function creates a feature in a declarative manner.

    Specifically, this function uses the feature string, parses it and
    compiles it into a function using the given tokenizers and similarity
    functions. The compiled function takes in two tuples and returns a
    feature value (typically a number).

    Args:
        feature_string (string): A feature expression to be converted into a
            function.
        tokenizers (dictionary): A Python dictionary containing tokenizers.
            Specifically, the dictionary contains tokenizer names as keys and
            tokenizer functions as values. A tokenizer function typically
            takes in a string and returns a list of tokens.
        similarity_functions (dictionary): A Python dictionary containing
            similarity functions. Specifically, the dictionary contains
            similarity function names as keys and similarity functions as
            values. A similarity function typically takes in a string or two
            lists of tokens and returns a number.

    Returns:
        This function returns a Python dictionary which contains sufficient
        information (such as attributes, tokenizers, function code) to be
        added to the feature table.

        Specifically, the dictionary contains the following keys:
        'left_attribute', 'right_attribute', 'left_attr_tokenizer',
        'right_attr_tokenizer', 'simfunction', 'function', and
        'function_source'.

        For all keys except 'function' and 'function_source', the value will
        be either a valid string (if the input feature string is parsed
        correctly) or PARSE_EXP (if the parsing was not successful).
        'function' will have a valid Python function as its value, and
        'function_source' will have the Python function's source in string
        format.

        The created function is self-contained, meaning that the tokenizers
        and similarity functions that it calls are bundled along with the
        returned function code.

    Raises:
        AssertionError: If `feature_string` is not of type string.
        AssertionError: If the input `tokenizers` is not of type dictionary.
        AssertionError: If the input `similarity_functions` is not of type
            dictionary.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> block_t = em.get_tokenizers_for_blocking()
        >>> block_s = em.get_sim_funs_for_blocking()
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> r = em.get_feature_fn('jaccard(qgm_3(ltuple.name), qgm_3(rtuple.name))', block_t, block_s)
        >>> em.add_feature(block_f, 'name_name_jac_qgm3_qgm3', r)

        >>> match_t = em.get_tokenizers_for_matching()
        >>> match_s = em.get_sim_funs_for_matching()
        >>> match_f = em.get_features_for_matching(A, B)
        >>> r = em.get_feature_fn('jaccard(qgm_3(ltuple.name), qgm_3(rtuple.name))', match_t, match_s)
        >>> em.add_feature(match_f, 'name_name_jac_qgm3_qgm3', r)

    See Also:
        :meth:`py_entitymatching.get_sim_funs_for_blocking`,
        :meth:`py_entitymatching.get_tokenizers_for_blocking`,
        :meth:`py_entitymatching.get_sim_funs_for_matching`,
        :meth:`py_entitymatching.get_tokenizers_for_matching`
    """
    # Validate input parameters
    # We expect the input feature string to be of type string
    validate_object_type(feature_string, six.string_types,
                         error_prefix='Input feature')
    # We expect the input tokenizers to be of type Python dictionary
    validate_object_type(tokenizers, dict,
                         error_prefix='Input object (tokenizers)')
    # We expect the input similarity functions to be of type Python dictionary
    validate_object_type(similarity_functions, dict,
                         error_prefix='Input object (similarity_functions)')

    # Initialize a dictionary to hold the tokenizers and similarity functions
    dict_to_compile = {}

    # Update the dictionary with the similarity functions
    if len(similarity_functions) > 0:
        dict_to_compile.update(similarity_functions)

    # Update the dictionary with the tokenizers
    if len(tokenizers) > 0:
        dict_to_compile.update(tokenizers)

    # Create a Python function string based on the input feature string
    function_string = 'def fn(ltuple, rtuple):\n'
    function_string += '    '
    function_string += 'return ' + feature_string

    # Parse the feature string to get the tokenizers, the similarity
    # function, and the attributes that it operates on
    parsed_dict = _parse_feat_str(feature_string, tokenizers,
                                  similarity_functions)

    # Compile the function string using the constructed dictionary
    six.exec_(function_string, dict_to_compile)

    # Update the parsed dictionary with the function and its source
    parsed_dict['function'] = dict_to_compile['fn']
    parsed_dict['function_source'] = function_string

    # Finally, return the parsed dictionary
    return parsed_dict
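# A minimal, self-contained sketch of how a compiled feature behaves. This
# demo is not part of the original module; the qgm_3 and jaccard functions
# below are toy stand-ins defined here for illustration, not the
# py_entitymatching built-ins of the same names.
def _demo_get_feature_fn():
    def qgm_3(value):
        # Toy 3-gram tokenizer: overlapping 3-character substrings
        s = str(value)
        return [s[i:i + 3] for i in range(len(s) - 2)] if len(s) > 2 else [s]

    def jaccard(left_tokens, right_tokens):
        # Toy Jaccard similarity between two token lists
        l, r = set(left_tokens), set(right_tokens)
        return len(l & r) / float(len(l | r)) if (l or r) else 0.0

    feature = get_feature_fn(
        "jaccard(qgm_3(ltuple['name']), qgm_3(rtuple['name']))",
        {'qgm_3': qgm_3}, {'jaccard': jaccard})
    # The compiled function accepts any objects indexable by attribute name,
    # e.g. plain dictionaries or pandas Series
    score = feature['function']({'name': 'apple corp'},
                                {'name': 'apple corporation'})
    return feature, score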
# Parse an input feature string
def _parse_feat_str(feature_string, tokenizers, similarity_functions):
    """
    This function parses a feature string to get the left attribute, the
    right attribute, the tokenizers, and the similarity function used in it.
    """
    # Validate the input parameters
    # We expect the input feature string to be of type string
    validate_object_type(feature_string, six.string_types,
                         error_prefix='Input feature')
    # We expect the input tokenizers to be of type Python dictionary
    validate_object_type(tokenizers, dict,
                         error_prefix='Input object (tokenizers)')
    # We expect the input similarity functions to be of type Python dictionary
    validate_object_type(similarity_functions, dict,
                         error_prefix='Input object (similarity_functions)')

    # We use the pyparsing module to parse the feature string
    from pyparsing import Word, alphanums, ParseException

    # Initialize the attribute, tokenizer, and similarity function parse
    # results
    left_attribute = 'PARSE_EXP'
    right_attribute = 'PARSE_EXP'
    left_attr_tokenizer = 'PARSE_EXP'
    right_attr_tokenizer = 'PARSE_EXP'
    sim_function = 'PARSE_EXP'
    exception_flag = False

    # Define the grammar elements: attribute names, tokenizer calls, and
    # similarity function calls with and without tokenizers
    attr_name = Word(alphanums + "_" + "." + "[" + "]" + '"' + "'")
    tok_fn = Word(alphanums + "_") + "(" + attr_name + ")"
    wo_tok = Word(alphanums + "_") + "(" + attr_name + "," + attr_name + ")"
    wi_tok = Word(alphanums + "_") + "(" + tok_fn + "," + tok_fn + ")"
    feat = wi_tok | wo_tok

    # Try to parse the feature string
    try:
        parsed_string = feat.parseString(feature_string)
    except ParseException:
        exception_flag = True

    if not exception_flag:
        # Pick out the tokenizers
        parsed_tokenizers = [value for value in parsed_string
                             if value in tokenizers.keys()]
        if len(parsed_tokenizers) == 2:
            left_attr_tokenizer = parsed_tokenizers[0]
            right_attr_tokenizer = parsed_tokenizers[1]

        # Pick out the similarity function
        parsed_similarity_function = [value for value in parsed_string
                                      if value in similarity_functions.keys()]
        if len(parsed_similarity_function) == 1:
            sim_function = parsed_similarity_function[0]

        # Pick out the left attribute
        attribute = [value for value in parsed_string
                     if value.startswith('ltuple[')]
        if len(attribute) == 1:
            attribute = attribute[0]
            left_attribute = attribute[7:len(attribute) - 1].strip('"').strip("'")

        # Pick out the right attribute
        attribute = [val for val in parsed_string
                     if val.startswith('rtuple[')]
        if len(attribute) == 1:
            attribute = attribute[0]
            right_attribute = attribute[7:len(attribute) - 1].strip('"').strip("'")

    # Return the parsed information in a dictionary format
    parsed_dict = {'left_attribute': left_attribute,
                   'right_attribute': right_attribute,
                   'left_attr_tokenizer': left_attr_tokenizer,
                   'right_attr_tokenizer': right_attr_tokenizer,
                   'simfunction': sim_function,
                   'is_auto_generated': False}

    return parsed_dict
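# A minimal sketch of what _parse_feat_str extracts; not part of the original
# module. The tokenizer and similarity names are only looked up as dictionary
# keys during parsing, so placeholder lambdas suffice as values.
def _demo_parse_feat_str():
    toks = {'qgm_3': lambda s: list(str(s))}
    sims = {'jaccard': lambda l, r: 0.0}
    parsed = _parse_feat_str(
        "jaccard(qgm_3(ltuple['name']), qgm_3(rtuple['name']))", toks, sims)
    # parsed['left_attribute'] == 'name', parsed['simfunction'] == 'jaccard'

    # A feature string outside the grammar yields PARSE_EXP placeholders
    bad = _parse_feat_str(
        '1.0 - jaccard(qgm_3(ltuple.name), qgm_3(rtuple.name))', toks, sims)
    # bad['left_attribute'] == 'PARSE_EXP'
    return parsed, bad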
def add_feature(feature_table, feature_name, feature_dict):
    """
    Adds a feature to the feature table.

    Specifically, this function is used in combination with
    :meth:`~py_entitymatching.get_feature_fn`. First the user creates a
    dictionary using :meth:`~py_entitymatching.get_feature_fn`, then uses
    this function to add `feature_dict` to the feature table.

    Args:
        feature_table (DataFrame): A DataFrame containing features.
        feature_name (string): The name that should be given to the feature.
        feature_dict (dictionary): A Python dictionary, that is typically
            returned by executing :meth:`~py_entitymatching.get_feature_fn`.

    Returns:
        A Boolean value of True is returned if the addition was successful.

    Raises:
        AssertionError: If the input `feature_table` is not of type pandas
            DataFrame.
        AssertionError: If `feature_name` is not of type string.
        AssertionError: If `feature_dict` is not of type Python dictionary.
        AssertionError: If `feature_table` does not have the necessary
            columns such as 'feature_name', 'left_attribute',
            'right_attribute', 'left_attr_tokenizer', 'right_attr_tokenizer',
            'simfunction', 'function', and 'function_source'.
        AssertionError: If `feature_name` is already present in the feature
            table.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> block_t = em.get_tokenizers_for_blocking()
        >>> block_s = em.get_sim_funs_for_blocking()
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> r = em.get_feature_fn('jaccard(qgm_3(ltuple.name), qgm_3(rtuple.name))', block_t, block_s)
        >>> em.add_feature(block_f, 'name_name_jac_qgm3_qgm3', r)

        >>> match_t = em.get_tokenizers_for_matching()
        >>> match_s = em.get_sim_funs_for_matching()
        >>> match_f = em.get_features_for_matching(A, B)
        >>> r = em.get_feature_fn('jaccard(qgm_3(ltuple.name), qgm_3(rtuple.name))', match_t, match_s)
        >>> em.add_feature(match_f, 'name_name_jac_qgm3_qgm3', r)
    """
    # Validate input parameters
    # We expect the feature table to be a pandas DataFrame
    validate_object_type(feature_table, pd.DataFrame, 'Input feature table')
    # We expect the feature name to be of type string
    validate_object_type(feature_name, six.string_types, 'Input feature name')
    # We expect the feature dictionary to be of type Python dictionary
    validate_object_type(feature_dict, dict, 'Input feature dictionary')

    # We expect the feature table to contain certain columns
    missing_columns = get_missing_column_values(feature_table.columns)
    if missing_columns:
        error_msg = ('Feature table does not have all required columns.\n'
                     'The following columns are missing: {0}'
                     .format(', '.join(missing_columns)))
        raise AssertionError(error_msg)

    # Check whether the feature name is already present in the feature table
    feature_names = list(feature_table['feature_name'])
    if feature_name in feature_names:
        logger.error('Input feature name is already present in feature table')
        raise AssertionError('Input feature name is already present in '
                             'feature table')

    # Add the feature to the feature table as the last entry
    feature_dict['feature_name'] = feature_name
    if len(feature_table) > 0:
        feature_table.loc[len(feature_table)] = feature_dict
    else:
        feature_table.columns = ['feature_name', 'left_attribute',
                                 'right_attribute', 'left_attr_tokenizer',
                                 'right_attr_tokenizer', 'simfunction',
                                 'function', 'function_source',
                                 'is_auto_generated']
        feature_table.loc[len(feature_table)] = feature_dict

    # Finally, return True if everything went fine
    return True
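# A minimal sketch wiring create_feature_table, get_feature_fn, and
# add_feature together; not part of the original module. The toy
# tokenizer/similarity functions are illustrative assumptions.
def _demo_add_feature():
    toks = {'qgm_3': lambda s: list(str(s))}
    sims = {'jaccard': lambda l, r:
            len(set(l) & set(r)) / float(len(set(l) | set(r)))}
    table = create_feature_table()
    fdict = get_feature_fn(
        "jaccard(qgm_3(ltuple['name']), qgm_3(rtuple['name']))", toks, sims)
    add_feature(table, 'name_name_jac_qgm3_qgm3', fdict)
    # Adding a feature under the same name again raises an AssertionError
    return table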
def get_missing_column_values(columns):
    # The columns that every feature table is required to contain
    required_column_names = ['feature_name', 'left_attribute',
                             'right_attribute', 'left_attr_tokenizer',
                             'right_attr_tokenizer', 'simfunction',
                             'function', 'function_source',
                             'is_auto_generated']
    return [name for name in required_column_names if name not in columns]


def create_feature_table():
    """
    Creates an empty feature table.
    """
    # Fix the column names
    column_names = ['feature_name', 'left_attribute', 'right_attribute',
                    'left_attr_tokenizer', 'right_attr_tokenizer',
                    'simfunction', 'function', 'function_source',
                    'is_auto_generated']
    # Create a pandas DataFrame using the column names
    feature_table = pd.DataFrame(columns=column_names)
    # Finally, return the feature table
    return feature_table
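# A minimal sketch of the column check used by add_feature; not part of the
# original module.
def _demo_column_check():
    table = create_feature_table()
    # A freshly created feature table has every required column
    assert get_missing_column_values(table.columns) == []
    # A plain DataFrame is missing all nine required columns
    return get_missing_column_values(pd.DataFrame().columns)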
def add_blackbox_feature(feature_table, feature_name, feature_function,
                         **kwargs):
    """
    Adds a black box feature to the feature table.

    Args:
        feature_table (DataFrame): The input DataFrame (typically a feature
            table) to which the feature must be added.
        feature_name (string): The name that should be given to the feature.
        feature_function (Python function): A Python function for the black
            box feature.

    Returns:
        A Boolean value of True is returned if the addition was successful.

    Raises:
        AssertionError: If the input `feature_table` is not of type
            DataFrame.
        AssertionError: If the input `feature_name` is not of type string.
        AssertionError: If `feature_table` does not have the necessary
            columns such as 'feature_name', 'left_attribute',
            'right_attribute', 'left_attr_tokenizer', 'right_attr_tokenizer',
            'simfunction', 'function', and 'function_source'.
        AssertionError: If `feature_name` is already present in the feature
            table.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> def age_diff(ltuple, rtuple):
        ...     # assume that the tuples have an age attribute and that its
        ...     # values are valid numbers
        ...     return ltuple['age'] - rtuple['age']
        >>> status = em.add_blackbox_feature(block_f, 'age_difference', age_diff)
    """
    # Validate input parameters
    # We expect the feature table to be a pandas DataFrame
    validate_object_type(feature_table, pd.DataFrame, 'Input feature table')
    # We expect the feature name to be of type string
    validate_object_type(feature_name, six.string_types, 'Input feature name')

    # Check whether the input feature table contains the necessary columns
    dummy_feature_table = create_feature_table()
    if sorted(dummy_feature_table.columns) != sorted(feature_table.columns):
        logger.error('Input feature table does not have the necessary columns')
        raise AssertionError('Input feature table does not have the '
                             'necessary columns')

    # Check whether the feature table already contains the given feature name
    feat_names = list(feature_table['feature_name'])
    if feature_name in feat_names:
        logger.error('Input feature name is already present in feature table')
        raise AssertionError('Input feature name is already present in '
                             'feature table')

    # Assemble the feature dictionary; metadata attributes not supplied via
    # kwargs default to None
    feature_dict = {}
    feature_dict['feature_name'] = feature_name
    feature_dict['function'] = feature_function
    feature_dict['left_attribute'] = kwargs.get('left_attribute')
    feature_dict['right_attribute'] = kwargs.get('right_attribute')
    feature_dict['left_attr_tokenizer'] = kwargs.get('left_attr_tokenizer')
    feature_dict['right_attr_tokenizer'] = kwargs.get('right_attr_tokenizer')
    feature_dict['simfunction'] = kwargs.get('simfunction')
    feature_dict['function_source'] = kwargs.get('function_source')
    feature_dict['is_auto_generated'] = False

    # Add the feature to the feature table as the last entry
    if len(feature_table) > 0:
        feature_table.loc[len(feature_table)] = feature_dict
    else:
        feature_table.columns = ['feature_name', 'left_attribute',
                                 'right_attribute', 'left_attr_tokenizer',
                                 'right_attr_tokenizer', 'simfunction',
                                 'function', 'function_source',
                                 'is_auto_generated']
        feature_table.loc[len(feature_table)] = feature_dict

    # Finally, return True if the addition was successful
    return True
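# A minimal sketch of adding and applying a black box feature; not part of
# the original module.
def _demo_add_blackbox_feature():
    def age_diff(ltuple, rtuple):
        # Assumes both tuples carry a numeric 'age' attribute
        return ltuple['age'] - rtuple['age']

    table = create_feature_table()
    add_blackbox_feature(table, 'age_difference', age_diff)
    # The stored function can be applied to any pair of tuple-like objects
    fn = table.loc[0, 'function']
    return fn({'age': 30}, {'age': 24})  # 6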