Source code for py_entitymatching.feature.autofeaturegen

"""
This module contains functions for auto feature generation.
"""
import logging

import pandas as pd
import six
from py_entitymatching.utils.validation_helper import validate_object_type

from IPython.display import display

import py_entitymatching as em
import py_entitymatching.feature.attributeutils as au
import py_entitymatching.feature.simfunctions as sim
import py_entitymatching.feature.tokenizers as tok

logger = logging.getLogger(__name__)


def get_features(ltable, rtable, l_attr_types, r_attr_types,
                 attr_corres, tok_funcs, sim_funcs):
    """
    This function will automatically generate a set of features based on the
    attributes of the input tables.

    Specifically, this function will go through the attribute
    correspondences between the input tables. For each correspondence,
    it examines the types of the involved attributes, then applies the
    appropriate tokenizers and sim functions to generate all appropriate
    features for this correspondence. Correspondences whose left and right
    attribute types differ are skipped (an info message is logged).

    Args:
        ltable,rtable (DataFrame): The pandas DataFrames for which the
            features must be generated.
        l_attr_types,r_attr_types (dictionary): The attribute types for the
            input DataFrames. Typically this is generated using the
            function 'get_attr_types'.
        attr_corres (dictionary): The attribute correspondences between the
            input DataFrames.
        tok_funcs (dictionary): A Python dictionary containing tokenizer
            functions.
        sim_funcs (dictionary): A Python dictionary containing similarity
            functions.

    Returns:
        A pandas DataFrame containing automatically generated features.
        Specifically, the DataFrame contains the following attributes:
        'feature_name', 'left_attribute', 'right_attribute',
        'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction',
        'function', 'function_source', 'is_auto_generated'.
        If no feature could be generated, an empty DataFrame with these
        columns is returned.

    Raises:
        AssertionError: If `ltable` is not of type pandas
            DataFrame.
        AssertionError: If `rtable` is not of type pandas
            DataFrame.
        AssertionError: If `l_attr_types` is not of type
            python dictionary.
        AssertionError: If `r_attr_types` is not of type
            python dictionary.
        AssertionError: If `attr_corres` is not of type
            python dictionary.
        AssertionError: If `sim_funcs` is not of type
            python dictionary.
        AssertionError: If `tok_funcs` is not of type
            python dictionary.
        AssertionError: If the `ltable` and `rtable` order is not the same
            as mentioned in the `l_attr_types`/`r_attr_types` and
            `attr_corres`.

    Examples:

        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_t = em.get_tokenizers_for_matching()
        >>> match_s = em.get_sim_funs_for_matching()
        >>> atypes1 = em.get_attr_types(A) # don't need, if atypes1 exists from blocking step
        >>> atypes2 = em.get_attr_types(B) # don't need, if atypes2 exists from blocking step
        >>> match_c = em.get_attr_corres(A, B)
        >>> match_f = em.get_features(A, B, atypes1, atypes2, match_c, match_t, match_s)

    See Also:
     :meth:`py_entitymatching.get_attr_corres`, :meth:`py_entitymatching.get_attr_types`,
     :meth:`py_entitymatching.get_sim_funs_for_blocking`,
     :meth:`py_entitymatching.get_tokenizers_for_blocking`,
     :meth:`py_entitymatching.get_sim_funs_for_matching`,
     :meth:`py_entitymatching.get_tokenizers_for_matching`


    Note:
        In the output DataFrame, two
        attributes demand some explanation: (1) function, and (2)
        is_auto_generated. The function points to the actual python function
        that implements the feature. Specifically, the function takes in two
        tuples (one from each input table) and returns a numeric value. The
        attribute is_auto_generated contains either True or False. The flag
        is True only if the feature is automatically generated by py_entitymatching.
        This is important because this flag is used to make some assumptions
        about the semantics of the similarity function used and use that
        information for scaling purposes.

    """
    # Validate input parameters
    # # We expect the ltable to be of type pandas DataFrame
    validate_object_type(ltable, pd.DataFrame, 'Input ltable')

    # # We expect the rtable to be of type pandas DataFrame
    validate_object_type(rtable, pd.DataFrame, 'Input rtable')

    # # We expect the l_attr_types to be of type python dictionary
    validate_object_type(l_attr_types, dict, 'Input l_attr_types')

    # # We expect the r_attr_types to be of type python dictionary
    validate_object_type(r_attr_types, dict, 'Input r_attr_types')

    # # We expect the attr_corres to be of type python dictionary
    validate_object_type(attr_corres, dict, 'Input attr_corres')

    # # We expect the tok_funcs to be of type python dictionary
    validate_object_type(tok_funcs, dict, 'Input tok_funcs')

    # # We expect the sim_funcs to be of type python dictionary
    validate_object_type(sim_funcs, dict, 'Input sim_funcs')

    # We expect the table order to be same in l/r_attr_types and attr_corres
    if not _check_table_order(ltable, rtable,
                              l_attr_types, r_attr_types, attr_corres):
        logger.error('Table order is different than what is mentioned '
                     'in l/r attr_types and attr_corres')
        raise AssertionError('Table order is different than what is mentioned '
                             'in l/r attr_types and attr_corres')

    # Columns (and their order) of the output feature table
    feature_columns = ['feature_name', 'left_attribute',
                       'right_attribute', 'left_attr_tokenizer',
                       'right_attr_tokenizer',
                       'simfunction', 'function',
                       'function_source', 'is_auto_generated']

    # Initialize output feature dictionary list
    feature_dict_list = []

    # Generate features for each attr. correspondence
    for ac in attr_corres['corres']:
        l_attr_type = l_attr_types[ac[0]]
        r_attr_type = r_attr_types[ac[1]]

        # Generate a feature only if the attribute types are same
        if l_attr_type != r_attr_type:
            # Pass the values as logging arguments so that the message is
            # only formatted when the INFO level is actually enabled.
            logger.info('py_entitymatching types: %s type (%s) and %s type (%s) '
                        'are different.'
                        'If you want to set them to be same and '
                        'generate features, '
                        'update output from get_attr_types and '
                        'use get_features command.\n.',
                        ac[0], l_attr_type, ac[1], r_attr_type)
            continue

        # Generate features
        features = _get_features_for_type(l_attr_type)

        # Convert features to function objects
        fn_objs = _conv_func_objs(features, ac, tok_funcs, sim_funcs)
        # Add the function object to a feature list.
        feature_dict_list.append(fn_objs)

    # Create a feature table restricted to the necessary columns. Passing
    # the columns to the constructor (instead of projecting afterwards)
    # keeps this working when no feature was generated: an empty DataFrame
    # has no columns to project and the projection would raise a KeyError.
    feature_table = pd.DataFrame(flatten_list(feature_dict_list),
                                 columns=feature_columns)
    # Return the feature table.
    return feature_table


def get_features_for_blocking(ltable, rtable, validate_inferred_attr_types=True):
    """
    This function automatically generates features that can be used for
    blocking purposes.

    Args:
        ltable,rtable (DataFrame): The pandas DataFrames for which the
            features are to be generated.
        validate_inferred_attr_types (boolean): A flag to indicate whether to
            show the user the inferred attribute types and the features
            chosen for those types.

    Returns:
        A pandas DataFrame containing automatically generated features, or
        None if `validate_inferred_attr_types` is True and the user chooses
        not to proceed.

        Specifically, the DataFrame contains the following attributes:
        'feature_name', 'left_attribute', 'right_attribute',
        'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction',
        'function', 'function_source', and 'is_auto_generated'.


        Further, this function also sets the following global variables
        (as attributes of the py_entitymatching package):
        _block_t, _block_s, _atypes1, _atypes2, and _block_c.

        The variable _block_t contains the tokenizers used and _block_s
        contains the similarity functions used for creating features.

        The variables _atypes1, and _atypes2 contain the attribute types for
        ltable and rtable respectively. The variable _block_c contains the
        attribute correspondences between the two input tables.

    Raises:
        AssertionError: If `ltable` is not of type pandas
            DataFrame.
        AssertionError: If `rtable` is not of type pandas
            DataFrame.
        AssertionError: If `validate_inferred_attr_types` is not of type
            boolean.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> block_f = em.get_features_for_blocking(A, B)

    Note:
        In the output DataFrame, two
        attributes demand some explanation: (1) function, and (2)
        is_auto_generated. The function points to the actual Python function
        that implements the feature. Specifically, the function takes in two
        tuples (one from each input table) and returns a numeric value. The
        attribute is_auto_generated contains either True or False. The flag
        is True only if the feature is automatically generated by py_entitymatching.
        This is important because this flag is used to make some assumptions
        about the semantics of the similarity function used and use that
        information for scaling purposes.

    See Also:
     :meth:`py_entitymatching.get_attr_corres`, :meth:`py_entitymatching.get_attr_types`,
     :meth:`py_entitymatching.get_sim_funs_for_blocking`
     :meth:`py_entitymatching.get_tokenizers_for_blocking`

    """
    # Validate input parameters
    # # We expect the ltable to be of type pandas DataFrame
    validate_object_type(ltable, pd.DataFrame, 'Input table A')

    # # We expect the rtable to be of type pandas DataFrame
    validate_object_type(rtable, pd.DataFrame, 'Input table B')

    # # We expect the validate_inferred_attr_types to be of type boolean
    validate_object_type(validate_inferred_attr_types, bool, 'Validate inferred attribute type')

    # Get the similarity functions to be used for blocking
    sim_funcs = sim.get_sim_funs_for_blocking()
    # Get the tokenizers to be used for blocking
    tok_funcs = tok.get_tokenizers_for_blocking()

    # Get the attr. types for ltable and rtable
    attr_types_ltable = au.get_attr_types(ltable)
    attr_types_rtable = au.get_attr_types(rtable)
    # Get the attr. correspondences between ltable and rtable
    attr_corres = au.get_attr_corres(ltable, rtable)

    # Show the user inferred attribute types and features and request
    # user permission to proceed
    if validate_inferred_attr_types:
        # if the user does not want to proceed, then exit the function
        # without generating features or touching the exported variables
        if validate_attr_types(attr_types_ltable, attr_types_rtable, attr_corres) is None:
            return None

    # Get features based on attr types, attr correspondences, sim functions
    # and tok. functions
    feature_table = get_features(ltable, rtable, attr_types_ltable,
                                 attr_types_rtable, attr_corres,
                                 tok_funcs, sim_funcs)

    # Export important variables to global name space
    em._block_t = tok_funcs
    em._block_s = sim_funcs
    em._atypes1 = attr_types_ltable
    em._atypes2 = attr_types_rtable
    em._block_c = attr_corres
    # Return the feature table
    return feature_table


def get_features_for_matching(ltable, rtable, validate_inferred_attr_types=True):
    """
    This function automatically generates features that can be used for
    matching purposes.

    Args:
        ltable,rtable (DataFrame): The pandas DataFrames for which the
            features are to be generated.
        validate_inferred_attr_types (boolean): A flag to indicate whether to
            show the user the inferred attribute types and the features
            chosen for those types.

    Returns:
        A pandas DataFrame containing automatically generated features, or
        None if `validate_inferred_attr_types` is True and the user chooses
        not to proceed.

        Specifically, the DataFrame contains the following attributes:
        'feature_name', 'left_attribute', 'right_attribute',
        'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction',
        'function', 'function_source', and 'is_auto_generated'.


        Further, this function also sets the following global variables
        (as attributes of the py_entitymatching package):
        _match_t, _match_s, _atypes1, _atypes2, and _match_c.

        The variable _match_t contains the tokenizers used and _match_s
        contains the similarity functions used for creating features.

        The variables _atypes1, and _atypes2 contain the attribute types for
        ltable and rtable respectively. The variable _match_c contains the
        attribute correspondences between the two input tables.

    Raises:
        AssertionError: If `ltable` is not of type pandas
            DataFrame.
        AssertionError: If `rtable` is not of type pandas
            DataFrame.
        AssertionError: If `validate_inferred_attr_types` is not of type
            boolean.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)

    Note:
        In the output DataFrame, two
        attributes demand some explanation: (1) function, and (2)
        is_auto_generated. The function points to the actual Python function
        that implements the feature. Specifically, the function takes in two
        tuples (one from each input table) and returns a numeric value. The
        attribute is_auto_generated contains either True or False. The flag
        is True only if the feature is automatically generated by py_entitymatching.
        This is important because this flag is used to make some assumptions
        about the semantics of the similarity function used and use that
        information for scaling purposes.

    See Also:
     :meth:`py_entitymatching.get_attr_corres`, :meth:`py_entitymatching.get_attr_types`,
     :meth:`py_entitymatching.get_sim_funs_for_matching`
     :meth:`py_entitymatching.get_tokenizers_for_matching`

    """
    # Validate input parameters
    # # We expect the ltable to be of type pandas DataFrame
    validate_object_type(ltable, pd.DataFrame, 'Input table A')

    # # We expect the rtable to be of type pandas DataFrame
    validate_object_type(rtable, pd.DataFrame, 'Input table B')

    # # We expect the validate_inferred_attr_types to be of type boolean
    validate_object_type(validate_inferred_attr_types, bool, 'Validate inferred attribute type')

    # Get similarity functions for generating the features for matching
    sim_funcs = sim.get_sim_funs_for_matching()
    # Get tokenizer functions for generating the features for matching
    tok_funcs = tok.get_tokenizers_for_matching()

    # Get the attribute types of the input tables
    attr_types_ltable = au.get_attr_types(ltable)
    attr_types_rtable = au.get_attr_types(rtable)

    # Get the attribute correspondence between the input tables
    attr_corres = au.get_attr_corres(ltable, rtable)

    # Show the user inferred attribute types and features and request
    # user permission to proceed
    if validate_inferred_attr_types:
        # if the user does not want to proceed, then exit the function
        # without generating features or touching the exported variables
        if validate_attr_types(attr_types_ltable, attr_types_rtable, attr_corres) is None:
            return None

    # Get the features
    feature_table = get_features(ltable, rtable, attr_types_ltable,
                                 attr_types_rtable, attr_corres,
                                 tok_funcs, sim_funcs)

    # Export important variables to global name space
    em._match_t = tok_funcs
    em._match_s = sim_funcs
    em._atypes1 = attr_types_ltable
    em._atypes2 = attr_types_rtable
    em._match_c = attr_corres

    # Finally return the feature table
    return feature_table


#
def _check_table_order(ltable, rtable, l_attr_types, r_attr_types, attr_corres):
    """
    Verify that the given tables are the very objects recorded in
    l_attr_types, r_attr_types and attr_corres.

    Returns True when every recorded table is identical (same object) to
    the corresponding input table; otherwise logs the first mismatch and
    returns False.
    """
    # Both tables must be pandas DataFrames
    validate_object_type(ltable, pd.DataFrame, 'Input left table')
    validate_object_type(rtable, pd.DataFrame, 'Input right table')

    # Note: the comparison is by object identity, which is rather strict:
    # the dictionaries must reference the exact DataFrame objects passed in.
    # Ideally only the contents of the DataFrames would need to agree.
    # Each entry: (input table, dictionary, key of recorded table, message).
    # The recorded table is looked up lazily, one check at a time.
    identity_checks = (
        (ltable, l_attr_types, '_table',
         'ltable is not the same as table mentioned in left attr types'),
        (rtable, r_attr_types, '_table',
         'rtable is not the same as table mentioned in right attr types'),
        (ltable, attr_corres, 'ltable',
         'ltable is not the same as table mentioned in attr correspondence'),
        (rtable, attr_corres, 'rtable',
         'rtable is not the same as table mentioned in attr correspondence'),
    )

    for table, recorded_in, key, mismatch_message in identity_checks:
        if table is not recorded_in[key]:
            logger.error(mismatch_message)
            return False

    # Every recorded table matched its input table.
    return True


# get look up table to generate features
def _get_feat_lkp_tbl():
    """
    This function embeds the knowledge of mapping what features to be
    generated for what kind of attr. types.

    """
    # Initialize a lookup table
    lookup_table = dict()

    # Features for type str_eq_1w
    lookup_table['STR_EQ_1W'] = [('lev_dist'), ('lev_sim'), ('jaro'),
                                ('jaro_winkler'),
                                 ('exact_match'),
                                 ('jaccard', 'qgm_3', 'qgm_3')]

    # Features for type str_bt_1w_5w
    lookup_table['STR_BT_1W_5W'] = [('jaccard', 'qgm_3', 'qgm_3'),
                                    ('cosine', 'dlm_dc0', 'dlm_dc0'),
                                    ('jaccard', 'dlm_dc0', 'dlm_dc0'),
                                    ('monge_elkan'), ('lev_dist'), ('lev_sim'),
                                    ('needleman_wunsch'),
                                    ('smith_waterman')]  # dlm_dc0 is the concrete space tokenizer

    # Features for type str_bt_5w_10w
    lookup_table['STR_BT_5W_10W'] = [('jaccard', 'qgm_3', 'qgm_3'),
                                     ('cosine', 'dlm_dc0', 'dlm_dc0'),
                                     ('monge_elkan'), ('lev_dist'), ('lev_sim')]

    # Features for type str_gt_10w
    lookup_table['STR_GT_10W'] = [('jaccard', 'qgm_3', 'qgm_3'),
                                  ('cosine', 'dlm_dc0', 'dlm_dc0')]

    # Features for NUMERIC type
    lookup_table['NUM'] = [('exact_match'), ('abs_norm'), ('lev_dist'),
                           ('lev_sim')]

    # Features for BOOLEAN type
    lookup_table['BOOL'] = [('exact_match')]

    # Features for un determined type
    lookup_table['UN_DETERMINED'] = []

    # Finally, return the lookup table
    return lookup_table


def _get_features_for_type(column_type):
    """
    Get the features to be generated for an attribute type.

    Args:
        column_type (string): One of the py_entitymatching type names
            ('str_eq_1w', 'str_bt_1w_5w', 'str_bt_5w_10w', 'str_gt_10w',
            'numeric', 'boolean', 'un_determined').

    Returns:
        The list of features registered for that type in the feature lookup
        table.

    Raises:
        TypeError: If `column_type` is not one of the known type names.
    """
    # Map each type name to its key in the feature lookup table. The lookup
    # compares by equality; an identity test (`is`) against a string literal
    # only succeeds for interned strings and would reject an equal type name
    # that was built at runtime (for instance read from a file).
    type_to_lookup_key = {
        'str_eq_1w': 'STR_EQ_1W',
        'str_bt_1w_5w': 'STR_BT_1W_5W',
        'str_bt_5w_10w': 'STR_BT_5W_10W',
        'str_gt_10w': 'STR_GT_10W',
        'numeric': 'NUM',
        'boolean': 'BOOL',
        'un_determined': 'UN_DETERMINED',
    }

    try:
        lookup_key = type_to_lookup_key[column_type]
    except (KeyError, TypeError):
        # KeyError: unknown name; TypeError: unhashable value.
        raise TypeError('Unknown type')

    # Return the feature functions that should be generated for this type.
    return _get_feat_lkp_tbl()[lookup_key]


def get_magellan_str_types():
    """
    Return the names of the py_entitymatching attribute types as a new list
    of strings: the four string-length types first, followed by the numeric,
    boolean and un-determined types.
    """
    string_length_types = ['str_eq_1w', 'str_bt_1w_5w', 'str_bt_5w_10w',
                           'str_gt_10w']
    remaining_types = ['numeric', 'boolean', 'un_determined']
    return string_length_types + remaining_types


# convert features from look up table to function objects
def _conv_func_objs(features, attributes,
                    tokenizer_functions, similarity_functions):
    """
    Convert features from the look up table to function objects.

    Args:
        features (list): Feature configurations; each one is a similarity
            function name or a (sim function, tokenizer, tokenizer) tuple.
        attributes: The (left attribute, right attribute) pair the features
            are generated for.
        tokenizer_functions (dictionary): Allowed tokenizers, keyed by name.
        similarity_functions (dictionary): Allowed similarity functions,
            keyed by name.

    Returns:
        A list of dictionaries (one per usable feature) as produced by
        conv_fn_str_to_obj. Features that use neither an allowed tokenizer
        nor an allowed similarity function are left out.
    """
    # We need to check whether the features have allowed tokenizers and
    # similarity functions.

    # # First get the names of the tokenizers and similarity functions.
    tokenizer_names = tokenizer_functions.keys()
    similarity_function_names = similarity_functions.keys()

    # # Second, validate every feature. check_valid_tok_sim expects the
    # similarity function names first and the tokenizer names second.
    checked_features = [check_valid_tok_sim(feature,
                                            similarity_function_names,
                                            tokenizer_names)
                        for feature in features]

    # Get function as a string and other meta data; finally we will get a
    # list of tuples. Rejected features come back as None and are dropped
    # here, because the conversion below indexes into every entry and would
    # fail on None.
    function_tuples = []
    for feature in checked_features:
        function_tuple = get_fn_str(feature, attributes)
        if function_tuple is not None:
            function_tuples.append(function_tuple)

    # Convert the function strings into function objects
    return conv_fn_str_to_obj(function_tuples, tokenizer_functions,
                              similarity_functions)


# check whether tokenizers and simfunctions are allowed
# inp is of the form ('jaccard', 'qgm_3', 'qgm_3') or ('lev')
def check_valid_tok_sim(inp, simlist, toklist):
    """
    Check whether a feature configuration uses an allowed similarity
    function or tokenizer.

    Args:
        inp: A feature configuration: a single name given as a string, or a
            sequence of length 1 or 3 such as ('jaccard', 'qgm_3', 'qgm_3').
        simlist: Names of the allowed similarity functions.
        toklist: Names of the allowed tokenizers.

    Returns:
        The feature configuration (a string is wrapped into a one-element
        list) if at least one of its entries appears in `simlist` or
        `toklist`; None otherwise.

    Raises:
        AssertionError: If the configuration does not have 1 or 3 entries.
    """
    if isinstance(inp, six.string_types):
        inp = [inp]
    # Raise explicitly rather than using an assert statement, so that the
    # check is still performed when Python runs with optimizations (-O).
    if len(inp) not in (1, 3):
        raise AssertionError('len of feature config should be 1 or 3')

    names = set(inp)
    # Accept the feature if it names an allowed similarity function or an
    # allowed tokenizer.
    if names.intersection(simlist) or names.intersection(toklist):
        return inp
    return None


# get function string for a feature
def get_fn_str(inp, attrs):
    """
    Build the function string (and its meta data) for one feature.

    `attrs` supplies the leading template arguments and `inp` (a single
    name or a sequence of names) the remaining ones. Returns the result of
    fill_fn_template, or None when `inp` is empty or None.
    """
    # Nothing to generate for an empty / rejected feature
    if not inp:
        return None

    # A bare name is treated as a one-element feature configuration
    if isinstance(inp, six.string_types):
        inp = [inp]

    # fill function string from a template
    template_args = list(attrs) + list(inp)
    return fill_fn_template(*template_args)


# fill function template
def fill_fn_template(attr1, attr2, sim_func, tok_func_1=None, tok_func_2=None):
    """
    Generate the source code of a feature function.

    The generated function takes (ltuple, rtuple) and returns the value of
    `sim_func` applied to ltuple[attr1] and rtuple[attr2]; when both
    tokenizers are given, each value is tokenized first.

    Returns the tuple (function name, attr1, attr2, tok_func_1, tok_func_2,
    sim_func, function source).
    """
    # get the function name
    fn_name = get_fn_name(attr1, attr2, sim_func, tok_func_1, tok_func_2)

    # expressions that read the attribute values from the two tuples
    left_value = 'ltuple["' + attr1 + '"]'
    right_value = 'rtuple["' + attr2 + '"]'

    if tok_func_1 is None or tok_func_2 is None:
        # no tokenizers: compare the raw attribute values
        sim_call = sim_func + '(' + left_value + ', ' + right_value + ')'
    else:
        # tokenize both values before comparing them
        sim_call = (sim_func + '(' + tok_func_1 + '(' + left_value + '), '
                    + tok_func_2 + '(' + right_value + ')) ')

    # assemble the source: imports, signature, and a body indented by
    # 4 spaces
    source_lines = [
        'from py_entitymatching.feature.simfunctions import *',
        'from py_entitymatching.feature.tokenizers import *',
        'def ' + fn_name + '(ltuple, rtuple):',
        '    ' + 'return ' + sim_call,
    ]
    fn_source = '\n'.join(source_lines)

    return fn_name, attr1, attr2, tok_func_1, tok_func_2, sim_func, fn_source


# construct function name from attrs, tokenizers and sim funcs

# sim_fn_names=['jaccard', 'lev', 'cosine', 'monge_elkan',
#               'needleman_wunsch', 'smith_waterman', 'jaro', 'jaro_winkler',
#               'exact_match', 'rel_diff', 'abs_norm']
def get_fn_name(attr1, attr2, sim_func, tok_func_1=None, tok_func_2=None):
    """
    Construct a feature function name from the attribute names, the
    similarity function and the (optional) tokenizers.

    Whitespace in the attribute names is replaced by underscores. Known
    similarity function / tokenizer names are abbreviated; unknown names
    are used as they are, and empty or None components are left out.
    """
    # abbreviations used in generated names
    abbreviations = {
        "jaccard": "jac",
        "lev_dist": "lev_dist",
        "lev_sim": "lev_sim",
        "cosine": "cos",
        "monge_elkan": "mel",
        "needleman_wunsch": "nmw",
        "smith_waterman": "sw",
        "jaro": "jar",
        "jaro_winkler": "jwn",
        "exact_match": "exm",
        "abs_norm": "anm",
        "rel_diff": "rdf",
        "1": "1",
        "2": "2",
        "3": "3",
        "4": "4",
        "tok_whitespace": "wsp",
        "tok_qgram": "qgm",
        "tok_delim": "dlm",
    }

    # first part: the two attribute names with whitespace collapsed to '_'
    attribute_part = '_'.join(['_'.join(attr1.split()),
                               '_'.join(attr2.split())])

    # second part: abbreviated sim function and tokenizers (if present)
    function_part = '_'.join(
        abbreviations.get(component, component)
        for component in (sim_func, tok_func_1, tok_func_2)
        if component
    )

    return '_'.join([attribute_part, function_part])


# conv function string to function object and return with meta data
def conv_fn_str_to_obj(fn_tup, tok, sim_funcs):
    """
    Convert function strings to function objects and return them together
    with their meta data.

    Each entry of `fn_tup` is indexed as (name, left attribute, right
    attribute, left tokenizer, right tokenizer, sim function, source).
    The source is executed in a namespace holding the given tokenizers and
    similarity functions, and the function defined under `name` is picked
    up from that namespace. Returns one dictionary per entry.
    """
    # namespace shared by all generated functions: tokenizers first, then
    # similarity functions (which win on a name clash)
    namespace = dict(tok)
    namespace.update(sim_funcs)

    converted = []
    for entry in fn_tup:
        feature_name = entry[0]
        left_attribute, right_attribute = entry[1], entry[2]
        left_tokenizer, right_tokenizer = entry[3], entry[4]
        sim_function_name = entry[5]
        source = entry[6]

        # define the feature function inside the shared namespace
        six.exec_(source, namespace)

        converted.append({
            'function': namespace[feature_name],
            'feature_name': feature_name,
            'left_attribute': left_attribute,
            'right_attribute': right_attribute,
            'left_attr_tokenizer': left_tokenizer,
            'right_attr_tokenizer': right_tokenizer,
            'simfunction': sim_function_name,
            'function_source': source,
            'is_auto_generated': True,
        })
    return converted


def flatten_list(inp_list):
    """
    Flatten one level of nesting: return a new list holding the items of
    every sub-list of `inp_list`, in order.
    """
    flattened = []
    for sublist in inp_list:
        flattened.extend(sublist)
    return flattened


# Show the user inferred attribute types and features and request
# user permission to proceed
def validate_attr_types(l_attr_types, r_attr_types, attr_corres):
    """
    Show the user the inferred attribute types and example features, and
    ask for permission to proceed.

    Args:
        l_attr_types,r_attr_types (dictionary): The attribute types of the
            left and right tables.
        attr_corres (dictionary): The attribute correspondences between the
            two tables; the pairs are read from its 'corres' entry.

    Returns:
        The displayed pandas DataFrame (one row per attribute
        correspondence) if the user answers 'y', or None if the user
        answers 'n'. The question is repeated until one of the two answers
        is given.
    """
    # # We expect the l_attr_types to be of type dictionary
    validate_object_type(l_attr_types, dict, 'Left table attribute types')

    # # We expect the r_attr_types to be of type dictionary
    validate_object_type(r_attr_types, dict, 'Right table attribute types')

    # # We expect the attr_corres to be of type dictionary
    validate_object_type(attr_corres, dict, 'Attribute correspondences')

    corres_features_list = []

    print('The table shows the corresponding attributes along with their respective '
          'types.\nPlease confirm that the information  has been correctly inferred.\n'
          'If you would like to skip this validation process in the future,\nplease set '
          'the flag validate_inferred_attr_types equal to false.')

    # Describe the features for each attr. correspondence
    for ac in attr_corres['corres']:
        l_attr_type = l_attr_types[ac[0]]
        r_attr_type = r_attr_types[ac[1]]

        # get human readable type names
        readable_l_type = _get_readable_type_name(l_attr_type)
        readable_r_type = _get_readable_type_name(r_attr_type)

        # Features are generated only if the attribute types are same
        if l_attr_type != r_attr_type:
            # Pass the values as logging arguments so that the message is
            # only formatted when the INFO level is actually enabled.
            logger.info('py_entitymatching types: %s type (%s) and %s type (%s) '
                        'are different.'
                        'If you want to set them to be same and '
                        'generate features, '
                        'update output from get_attr_types and '
                        'use get_features command.\n.',
                        ac[0], l_attr_type, ac[1], r_attr_type)
            features = ['N/A']
        else:
            # Get the features generated for this type
            features = _get_features_for_type(l_attr_type)

        # Change features into more readable format
        readable_features = [_get_readable_feature_name(feat)
                             for feat in features]

        # Show only the first 2 features as examples, joined into a string
        readable_features_str = "; ".join(readable_features[:2])

        # Add information for each set of corresponding attributes to the list
        corres_features_list.append([ac[0], ac[1], readable_l_type, readable_r_type, readable_features_str])

    # create and display table for the user
    # Create the pandas dataframe from the lists
    labels = ['Left Attribute', 'Right Attribute', 'Left Attribute Type', 'Right Attribute Type',
              'Example Features']
    corres_feat_df = pd.DataFrame(corres_features_list, columns=labels)

    # display the pandas dataframe
    display(corres_feat_df)

    # Ask user if the inferred types are satisfactory. Repeat until satisfactory answer is reached
    while True:
        response = six.moves.input('Do you want to proceed? (y/n):')
        if response == 'y':
            return corres_feat_df
        elif response == 'n':
            print('\nIf the attribute correspondences or types have been inferred incorrectly,\n'
                  'use the get_features() function with your  own correspondences and attribute\n'
                  'types to get the correct features for your data')
            return None
        else:
            print("You must answer with either 'y' or 'n'")



# get look up table to generate readable type names
def _get_type_name_lkp_tbl():
    # Initialize a lookup table
    lookup_table = dict()

    # Map type names to more human readable names
    lookup_table['str_eq_1w'] = 'short string (1 word)'
    lookup_table['str_bt_1w_5w'] = 'short string (1 word to 5 words)'
    lookup_table['str_bt_5w_10w'] = 'medium string (5 words to 10 words)'
    lookup_table['str_gt_10w'] = 'short string (1 word)'
    lookup_table['numeric'] = 'numeric'
    lookup_table['boolean'] = 'boolean'
    lookup_table['un_determined'] = 'un-determined type'

    return lookup_table


# Get readable names for the types
def _get_readable_type_name(column_type):
    """
    Return the human readable name of an attribute type.

    Raises a TypeError ('Unknown type') if the type is not present in the
    type name lookup table.
    """
    readable_names = _get_type_name_lkp_tbl()

    # Reject types that the lookup table does not know about
    if column_type not in readable_names:
        raise TypeError('Unknown type')

    return readable_names[column_type]


# get look up table to generate readable feature descriptions
def _get_feature_name_lkp_tbl():

    # Initialize a lookup table
    lookup_table = dict()

    # Map features to more human readable descriptions
    lookup_table['lev_dist'] = 'Levenshtein Distance'
    lookup_table['lev_sim'] = 'Levenshtein Similarity'
    lookup_table['jaro'] = 'Jaro Distance'
    lookup_table['jaro_winkler'] = 'Jaro-Winkler Distance'
    lookup_table['exact_match'] = 'Exact Match'
    lookup_table['needleman_wunsch'] = 'Needleman-Wunsch Algorithm'
    lookup_table['smith_waterman'] = 'Smith-Waterman Algorithm'
    lookup_table['abs_norm'] = 'Absolute Norm'
    lookup_table['jaccard'] = 'Jaccard Similarity'
    lookup_table['monge_elkan'] = 'Monge-Elkan Algorithm'
    lookup_table['cosine'] = 'Cosine Similarity'
    lookup_table['qgm_1'] = "1-grams"
    lookup_table['qgm_2'] = "2-grams"
    lookup_table['qgm_3'] = "3-grams"
    lookup_table['qgm_4'] = "4-grams"
    lookup_table['dlm_dc0'] = 'Space Delimiter'
    lookup_table['dlm_wsp'] = 'Whitespace Delimiter'
    lookup_table['N/A'] = 'Not Applicable: Types do not match'

    return lookup_table


# Get readable names for the features
def _get_readable_feature_name(feature):
    """
    Return a human readable description of a feature.

    `feature` is either a single name (a string) or a sequence of three
    names (similarity function, left tokenizer, right tokenizer); the latter
    is rendered as 'sim [left tokenizer, right tokenizer]'.

    Raises an AssertionError if a name is missing from the feature name
    lookup table, or if a non-string feature does not have 3 entries.
    """
    descriptions = _get_feature_name_lkp_tbl()

    def describe(name):
        # Look up a single name, rejecting names the table does not know
        if name not in descriptions:
            raise AssertionError('Feature is not present in lookup table')
        return descriptions[name]

    # A plain string names one similarity function
    if isinstance(feature, six.string_types):
        return describe(feature)

    # Anything else must be (sim function, tokenizer, tokenizer)
    if len(feature) != 3:
        raise AssertionError('Features should have either 0 or 2 (one for each table) tokenizers')

    parts = [describe(name) for name in feature]
    return parts[0] + ' [' + parts[1] + ', ' + parts[2] + "]"