"""
This module contains functions for auto feature generation.
"""
import logging
import pandas as pd
import six
from py_entitymatching.utils.validation_helper import validate_object_type
from IPython.display import display
import py_entitymatching as em
import py_entitymatching.feature.attributeutils as au
import py_entitymatching.feature.simfunctions as sim
import py_entitymatching.feature.tokenizers as tok
logger = logging.getLogger(__name__)
def get_features(ltable, rtable, l_attr_types, r_attr_types,
                 attr_corres, tok_funcs, sim_funcs):
    """
    Automatically generate a set of features from the attributes of the
    input tables.

    The function walks the attribute correspondences between the input
    tables. For each correspondence whose two attributes have the same
    type, it looks up the features configured for that type and turns them
    into function objects using the given tokenizers and similarity
    functions. Correspondences whose attribute types differ are skipped
    (an info message is logged).

    Args:
        ltable,rtable (DataFrame): The pandas DataFrames for which the
            features must be generated.
        l_attr_types,r_attr_types (dictionary): The attribute types for the
            input DataFrames. Typically this is generated using the
            function 'get_attr_types'.
        attr_corres (dictionary): The attribute correspondences between the
            input DataFrames. The pairs are read from its 'corres' entry.
        tok_funcs (dictionary): A Python dictionary containing tokenizer
            functions.
        sim_funcs (dictionary): A Python dictionary containing similarity
            functions.

    Returns:
        A pandas DataFrame containing automatically generated features.
        Specifically, the DataFrame contains the following attributes:
        'feature_name', 'left_attribute', 'right_attribute',
        'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction',
        'function', 'function_source', 'is_auto_generated'.
        If no feature could be generated, an empty DataFrame with these
        columns is returned.

    Raises:
        AssertionError: If `ltable` is not of type pandas
            DataFrame.
        AssertionError: If `rtable` is not of type pandas
            DataFrame.
        AssertionError: If `l_attr_types` is not of type
            python dictionary.
        AssertionError: If `r_attr_types` is not of type
            python dictionary.
        AssertionError: If `attr_corres` is not of type
            python dictionary.
        AssertionError: If `sim_funcs` is not of type
            python dictionary.
        AssertionError: If `tok_funcs` is not of type
            python dictionary.
        AssertionError: If the `ltable` and `rtable` order is not the same
            as mentioned in the `l_attr_types`/`r_attr_types` and
            `attr_corres`.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_t = em.get_tokenizers_for_matching()
        >>> match_s = em.get_sim_funs_for_matching()
        >>> atypes1 = em.get_attr_types(A) # don't need, if atypes1 exists from blocking step
        >>> atypes2 = em.get_attr_types(B) # don't need, if atypes2 exists from blocking step
        >>> match_c = em.get_attr_corres(A, B)
        >>> match_f = em.get_features(A, B, atypes1, atypes2, match_c, match_t, match_s)

    See Also:
        :meth:`py_entitymatching.get_attr_corres`, :meth:`py_entitymatching.get_attr_types`,
        :meth:`py_entitymatching.get_sim_funs_for_blocking`,
        :meth:`py_entitymatching.get_tokenizers_for_blocking`,
        :meth:`py_entitymatching.get_sim_funs_for_matching`,
        :meth:`py_entitymatching.get_tokenizers_for_matching`

    Note:
        In the output DataFrame, two attributes demand some explanation:
        (1) function, and (2) is_auto_generated. The function points to the
        actual Python function that implements the feature; it takes two
        tuples (one from each input table). The attribute
        is_auto_generated is a flag marking features generated by
        py_entitymatching.
    """
    # Validate input parameters
    # # The tables must be pandas DataFrames
    validate_object_type(ltable, pd.DataFrame, 'Input ltable')
    validate_object_type(rtable, pd.DataFrame, 'Input rtable')
    # # The attribute types, correspondences, tokenizers and sim functions
    # # must all be python dictionaries
    validate_object_type(l_attr_types, dict, 'Input l_attr_types')
    validate_object_type(r_attr_types, dict, 'Input r_attr_types')
    validate_object_type(attr_corres, dict, 'Input attr_corres')
    validate_object_type(tok_funcs, dict, 'Input tok_funcs')
    validate_object_type(sim_funcs, dict, 'Input sim_funcs')

    # We expect the table order to be same in l/r_attr_types and attr_corres
    if not _check_table_order(ltable, rtable,
                              l_attr_types, r_attr_types, attr_corres):
        logger.error('Table order is different than what is mentioned '
                     'in l/r attr_types and attr_corres')
        raise AssertionError('Table order is different than what is mentioned '
                             'in l/r attr_types and attr_corres')

    # Initialize output feature dictionary list
    feature_dict_list = []

    # Generate features for each attr. correspondence
    for ac in attr_corres['corres']:
        l_attr_type = l_attr_types[ac[0]]
        r_attr_type = r_attr_types[ac[1]]

        # Generate a feature only if the attribute types are same
        if l_attr_type != r_attr_type:
            logger.info('py_entitymatching types: %s type (%s) and %s type (%s) '
                        'are different.'
                        'If you want to set them to be same and '
                        'generate features, '
                        'update output from get_attr_types and '
                        'use get_features command.\n.',
                        ac[0], l_attr_type, ac[1], r_attr_type)
            continue

        # Look up the features configured for this attribute type
        features = _get_features_for_type(l_attr_type)

        # Convert features to function objects
        fn_objs = _conv_func_objs(features, ac, tok_funcs, sim_funcs)

        # Add the function objects to the feature list.
        feature_dict_list.append(fn_objs)

    # Create the feature table, keeping only the necessary columns. Passing
    # `columns` to the constructor (instead of projecting afterwards) keeps
    # this working when no feature was generated at all.
    feature_table = pd.DataFrame(
        flatten_list(feature_dict_list),
        columns=['feature_name', 'left_attribute',
                 'right_attribute', 'left_attr_tokenizer',
                 'right_attr_tokenizer',
                 'simfunction', 'function',
                 'function_source', 'is_auto_generated'])

    # Return the feature table.
    return feature_table
def get_features_for_blocking(ltable, rtable, validate_inferred_attr_types=True):
    """
    Automatically generate features that can be used for blocking.

    Args:
        ltable,rtable (DataFrame): The pandas DataFrames for which the
            features are to be generated.
        validate_inferred_attr_types (boolean): A flag to indicate whether to
            show the user the inferred attribute types and the features
            chosen for those types, and to ask for confirmation.

    Returns:
        A pandas DataFrame containing automatically generated features.
        Specifically, the DataFrame contains the following attributes:
        'feature_name', 'left_attribute', 'right_attribute',
        'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction',
        'function', 'function_source', and 'is_auto_generated'.

        If validation is requested and the user declines to proceed, None
        is returned and nothing is exported.

        Further, this function also sets the following variables on the
        py_entitymatching package: _block_t, _block_s, _atypes1, _atypes2,
        and _block_c. The variable _block_t contains the tokenizers used
        and _block_s contains the similarity functions used for creating
        features. The variables _atypes1 and _atypes2 contain the attribute
        types for ltable and rtable respectively. The variable _block_c
        contains the attribute correspondences between the two input tables.

    Raises:
        AssertionError: If `ltable` is not of type pandas
            DataFrame.
        AssertionError: If `rtable` is not of type pandas
            DataFrame.
        AssertionError: If `validate_inferred_attr_types` is not of type
            boolean.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> block_f = em.get_features_for_blocking(A, B)

    See Also:
        :meth:`py_entitymatching.get_attr_corres`, :meth:`py_entitymatching.get_attr_types`,
        :meth:`py_entitymatching.get_sim_funs_for_blocking`
        :meth:`py_entitymatching.get_tokenizers_for_blocking`
    """
    # Check the argument types: two DataFrames and a boolean flag
    validate_object_type(ltable, pd.DataFrame, 'Input table A')
    validate_object_type(rtable, pd.DataFrame, 'Input table B')
    validate_object_type(validate_inferred_attr_types, bool,
                         'Validate inferred attribute type')

    # Similarity functions and tokenizers meant for blocking
    blocking_sim_funcs = sim.get_sim_funs_for_blocking()
    blocking_tok_funcs = tok.get_tokenizers_for_blocking()

    # Attribute types of each table and the correspondences between them
    ltable_attr_types = au.get_attr_types(ltable)
    rtable_attr_types = au.get_attr_types(rtable)
    correspondences = au.get_attr_corres(ltable, rtable)

    # When asked to, show the inferred types to the user; stop here if the
    # user does not want to proceed
    if validate_inferred_attr_types and validate_attr_types(
            ltable_attr_types, rtable_attr_types, correspondences) is None:
        return None

    # Build the features from types, correspondences, sim functions and
    # tokenizers
    blocking_features = get_features(ltable, rtable, ltable_attr_types,
                                     rtable_attr_types, correspondences,
                                     blocking_tok_funcs, blocking_sim_funcs)

    # Export the ingredients to the package name space
    em._block_t = blocking_tok_funcs
    em._block_s = blocking_sim_funcs
    em._atypes1 = ltable_attr_types
    em._atypes2 = rtable_attr_types
    em._block_c = correspondences

    return blocking_features
def get_features_for_matching(ltable, rtable, validate_inferred_attr_types=True):
    """
    Automatically generate features that can be used for matching.

    Args:
        ltable,rtable (DataFrame): The pandas DataFrames for which the
            features are to be generated.
        validate_inferred_attr_types (boolean): A flag to indicate whether to
            show the user the inferred attribute types and the features
            chosen for those types, and to ask for confirmation.

    Returns:
        A pandas DataFrame containing automatically generated features.
        Specifically, the DataFrame contains the following attributes:
        'feature_name', 'left_attribute', 'right_attribute',
        'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction',
        'function', 'function_source', and 'is_auto_generated'.

        If validation is requested and the user declines to proceed, None
        is returned and nothing is exported.

        Further, this function also sets the following variables on the
        py_entitymatching package: _match_t, _match_s, _atypes1, _atypes2,
        and _match_c. The variable _match_t contains the tokenizers used
        and _match_s contains the similarity functions used for creating
        features. The variables _atypes1 and _atypes2 contain the attribute
        types for ltable and rtable respectively. The variable _match_c
        contains the attribute correspondences between the two input tables.

    Raises:
        AssertionError: If `ltable` is not of type pandas
            DataFrame.
        AssertionError: If `rtable` is not of type pandas
            DataFrame.
        AssertionError: If `validate_inferred_attr_types` is not of type
            boolean.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)

    See Also:
        :meth:`py_entitymatching.get_attr_corres`, :meth:`py_entitymatching.get_attr_types`,
        :meth:`py_entitymatching.get_sim_funs_for_matching`
        :meth:`py_entitymatching.get_tokenizers_for_matching`
    """
    # Check the argument types: two DataFrames and a boolean flag
    validate_object_type(ltable, pd.DataFrame, 'Input table A')
    validate_object_type(rtable, pd.DataFrame, 'Input table B')
    validate_object_type(validate_inferred_attr_types, bool,
                         'Validate inferred attribute type')

    # Similarity functions and tokenizers meant for matching
    matching_sim_funcs = sim.get_sim_funs_for_matching()
    matching_tok_funcs = tok.get_tokenizers_for_matching()

    # Attribute types of each table and the correspondences between them
    ltable_attr_types = au.get_attr_types(ltable)
    rtable_attr_types = au.get_attr_types(rtable)
    correspondences = au.get_attr_corres(ltable, rtable)

    # When asked to, show the inferred types to the user; stop here if the
    # user does not want to proceed
    if validate_inferred_attr_types and validate_attr_types(
            ltable_attr_types, rtable_attr_types, correspondences) is None:
        return None

    # Build the features
    matching_features = get_features(ltable, rtable, ltable_attr_types,
                                     rtable_attr_types, correspondences,
                                     matching_tok_funcs, matching_sim_funcs)

    # Export the ingredients to the package name space
    em._match_t = matching_tok_funcs
    em._match_s = matching_sim_funcs
    em._atypes1 = ltable_attr_types
    em._atypes2 = rtable_attr_types
    em._match_c = correspondences

    return matching_features
#
def _check_table_order(ltable, rtable, l_attr_types, r_attr_types, attr_corres):
    """
    Check whether the given tables are the very objects recorded in
    l_attr_types, r_attr_types and attr_corres.

    Returns True when all four references match, otherwise logs which
    reference differs and returns False.
    """
    # Both tables must be pandas DataFrames
    validate_object_type(ltable, pd.DataFrame, 'Input left table')
    validate_object_type(rtable, pd.DataFrame, 'Input right table')

    # Note: this is an identity check, which is a bit too aggressive: the
    # tables stored in the dictionaries must be the same objects as the
    # inputs. Ideally we would only compare the contents of the DataFrames.
    # Each entry is (expected table, dictionary, key, error message); the
    # dictionary is only read when its turn comes.
    references = (
        (ltable, l_attr_types, '_table',
         'ltable is not the same as table mentioned in left attr types'),
        (rtable, r_attr_types, '_table',
         'rtable is not the same as table mentioned in right attr types'),
        (ltable, attr_corres, 'ltable',
         'ltable is not the same as table mentioned in attr correspondence'),
        (rtable, attr_corres, 'rtable',
         'rtable is not the same as table mentioned in attr correspondence'),
    )
    for expected_table, holder, key, message in references:
        if holder[key] is not expected_table:
            logger.error(message)
            return False

    # Every reference points at the expected table
    return True
# get look up table to generate features
def _get_feat_lkp_tbl():
"""
This function embeds the knowledge of mapping what features to be
generated for what kind of attr. types.
"""
# Initialize a lookup table
lookup_table = dict()
# Features for type str_eq_1w
lookup_table['STR_EQ_1W'] = [('lev_dist'), ('lev_sim'), ('jaro'),
('jaro_winkler'),
('exact_match'),
('jaccard', 'qgm_3', 'qgm_3')]
# Features for type str_bt_1w_5w
lookup_table['STR_BT_1W_5W'] = [('jaccard', 'qgm_3', 'qgm_3'),
('cosine', 'dlm_dc0', 'dlm_dc0'),
('jaccard', 'dlm_dc0', 'dlm_dc0'),
('monge_elkan'), ('lev_dist'), ('lev_sim'),
('needleman_wunsch'),
('smith_waterman')] # dlm_dc0 is the concrete space tokenizer
# Features for type str_bt_5w_10w
lookup_table['STR_BT_5W_10W'] = [('jaccard', 'qgm_3', 'qgm_3'),
('cosine', 'dlm_dc0', 'dlm_dc0'),
('monge_elkan'), ('lev_dist'), ('lev_sim')]
# Features for type str_gt_10w
lookup_table['STR_GT_10W'] = [('jaccard', 'qgm_3', 'qgm_3'),
('cosine', 'dlm_dc0', 'dlm_dc0')]
# Features for NUMERIC type
lookup_table['NUM'] = [('exact_match'), ('abs_norm'), ('lev_dist'),
('lev_sim')]
# Features for BOOLEAN type
lookup_table['BOOL'] = [('exact_match')]
# Features for un determined type
lookup_table['UN_DETERMINED'] = []
# Finally, return the lookup table
return lookup_table
def _get_features_for_type(column_type):
    """
    Get the features to be generated for an attribute type.

    Args:
        column_type (string): One of the py_entitymatching type names
            ('str_eq_1w', 'str_bt_1w_5w', 'str_bt_5w_10w', 'str_gt_10w',
            'numeric', 'boolean', 'un_determined').

    Returns:
        The list of features configured for that type in the feature
        lookup table.

    Raises:
        TypeError: If `column_type` is not one of the known type names.
    """
    # First get the look up table
    lookup_table = _get_feat_lkp_tbl()

    # Map each type name to its key in the lookup table. The match is by
    # string equality; identity (`is`) would depend on string interning and
    # can fail for type names built at runtime.
    lookup_keys = {
        'str_eq_1w': 'STR_EQ_1W',
        'str_bt_1w_5w': 'STR_BT_1W_5W',
        'str_bt_5w_10w': 'STR_BT_5W_10W',
        'str_gt_10w': 'STR_GT_10W',
        'numeric': 'NUM',
        'boolean': 'BOOL',
        'un_determined': 'UN_DETERMINED',
    }
    if column_type not in lookup_keys:
        raise TypeError('Unknown type')

    # Based on the column type, return the features that should be generated.
    return lookup_table[lookup_keys[column_type]]
def get_magellan_str_types():
    """
    Return the py_entitymatching attribute type names as a new list of
    strings.
    """
    type_names = (
        'str_eq_1w',
        'str_bt_1w_5w',
        'str_bt_5w_10w',
        'str_gt_10w',
        'numeric',
        'boolean',
        'un_determined',
    )
    return list(type_names)
# convert features from look up table to function objects
def _conv_func_objs(features, attributes,
                    tokenizer_functions, similarity_functions):
    """
    Convert features from the look up table to function objects.

    Args:
        features (list): Feature configurations, each either a similarity
            function name or a (sim function, tokenizer, tokenizer) tuple.
        attributes: The pair of corresponding attribute names
            (left attribute, right attribute).
        tokenizer_functions (dictionary): Tokenizer name -> function.
        similarity_functions (dictionary): Sim function name -> function.

    Returns:
        A list of dictionaries, one per feature that passed the check,
        as produced by `conv_fn_str_to_obj`.
    """
    # We need to check whether the features use allowed tokenizers and
    # similarity functions, so first get the names of both.
    tokenizer_names = tokenizer_functions.keys()
    similarity_names = similarity_functions.keys()

    # Get the function as a string and other meta data for every feature
    # that passes the check. A rejected feature comes back as None and must
    # be dropped here: the conversion step below indexes into every entry.
    function_tuples = []
    for feature in features:
        checked_feature = check_valid_tok_sim(feature, similarity_names,
                                              tokenizer_names)
        function_tuple = get_fn_str(checked_feature, attributes)
        if function_tuple is not None:
            function_tuples.append(function_tuple)

    # Convert the function strings into function objects
    return conv_fn_str_to_obj(function_tuples, tokenizer_functions,
                              similarity_functions)
# check whether the tokenizers and sim functions used by a feature are allowed
# inp is of the form ('jaccard', 'qgm_3', 'qgm_3') or a single name such as 'lev_dist'
def check_valid_tok_sim(inp, simlist, toklist):
    """
    Return the feature configuration if it uses an allowed name, else None.

    A bare string is first wrapped into a one-element list. The
    configuration is returned as soon as at least one of its entries
    appears in `simlist` or in `toklist`.
    """
    if isinstance(inp, six.string_types):
        inp = [inp]
    assert len(inp) in (1, 3), 'len of feature config should be 1 or 3'

    names_used = set(inp)
    # accept when a name is a known sim function, or a known tokenizer
    if names_used.intersection(simlist) or names_used.intersection(toklist):
        return inp
    return None
# get function string for a feature
def get_fn_str(inp, attrs):
    """
    Build the function string (and its meta data) for one feature.

    Returns None when `inp` is empty or None; otherwise returns whatever
    `fill_fn_template` produces for the attributes followed by the feature
    configuration.
    """
    if not inp:
        return None

    # a bare sim function name is treated as a one-element configuration
    if isinstance(inp, six.string_types):
        inp = [inp]

    # template arguments: the attribute names first, then the configuration
    template_args = list(attrs) + list(inp)
    return fill_fn_template(*template_args)
# fill function template
def fill_fn_template(attr1, attr2, sim_func, tok_func_1=None, tok_func_2=None):
    """
    Fill the source template of a feature function.

    The generated source imports the sim functions and tokenizers and
    defines a function of (ltuple, rtuple). With both tokenizers given, the
    function applies the sim function to the tokenized attribute values;
    otherwise it applies the sim function to the raw attribute values.

    Returns:
        A tuple (function name, attr1, attr2, tok_func_1, tok_func_2,
        sim_func, function source).
    """
    # name of the generated function
    fn_name = get_fn_name(attr1, attr2, sim_func, tok_func_1, tok_func_2)

    # expressions reading the attribute values from the two tuples
    left_value = 'ltuple["' + attr1 + '"]'
    right_value = 'rtuple["' + attr2 + '"]'

    if tok_func_1 is not None and tok_func_2 is not None:
        # tokenize both values, then compare the token collections
        fn_body = 'return {0}({1}({2}), {3}({4})) '.format(
            sim_func, tok_func_1, left_value, tok_func_2, right_value)
    else:
        # compare the raw values directly
        fn_body = 'return {0}({1}, {2})'.format(
            sim_func, left_value, right_value)

    # assemble: imports, def line, then the indented body
    source_parts = [
        'from py_entitymatching.feature.simfunctions import *\n',
        'from py_entitymatching.feature.tokenizers import *\n',
        'def ' + fn_name + '(ltuple, rtuple):',
        '\n',
        ' ',
        fn_body,
    ]
    s = ''.join(source_parts)
    return fn_name, attr1, attr2, tok_func_1, tok_func_2, sim_func, s
# construct function name from attrs, tokenizers and sim funcs
# sim_fn_names=['jaccard', 'lev_dist', 'lev_sim', 'cosine', 'monge_elkan',
# 'needleman_wunsch', 'smith_waterman', 'jaro', 'jaro_winkler',
# 'exact_match', 'rel_diff', 'abs_norm']
def get_fn_name(attr1, attr2, sim_func, tok_func_1=None, tok_func_2=None):
    """
    Construct a feature function name from the attribute names, the sim
    function and the tokenizers.

    Whitespace inside the attribute names is replaced by underscores.
    Known sim function / tokenizer names are abbreviated; unknown names are
    used as they are, and missing (None or empty) ones are left out. All
    parts are joined with underscores.
    """
    # abbreviations used inside feature names
    abbreviations = {
        "jaccard": "jac",
        "lev_dist": "lev_dist",
        "lev_sim": "lev_sim",
        "cosine": "cos",
        "monge_elkan": "mel",
        "needleman_wunsch": "nmw",
        "smith_waterman": "sw",
        "jaro": "jar",
        "jaro_winkler": "jwn",
        "exact_match": "exm",
        "abs_norm": "anm",
        "rel_diff": "rdf",
        "1": "1",
        "2": "2",
        "3": "3",
        "4": "4",
        "tok_whitespace": "wsp",
        "tok_qgram": "qgm",
        "tok_delim": "dlm",
    }

    # first part: the two attribute names, whitespace turned into '_'
    attr_part = '_'.join('_'.join(attr.split()) for attr in (attr1, attr2))

    # second part: sim function and tokenizers, skipping the missing ones
    func_part = '_'.join(abbreviations.get(part, part)
                         for part in (sim_func, tok_func_1, tok_func_2)
                         if part)

    return attr_part + '_' + func_part
# conv function string to function object and return with meta data
def conv_fn_str_to_obj(fn_tup, tok, sim_funcs):
    """
    Convert function strings to function objects and return them together
    with their meta data.

    Each entry of `fn_tup` is indexed as (name, left attribute, right
    attribute, left tokenizer, right tokenizer, sim function, source). The
    source is executed in a single namespace seeded with the tokenizers and
    sim functions, and the function defined under `name` is picked up from
    that namespace.

    Returns:
        A list with one dictionary per entry of `fn_tup`.
    """
    # namespace in which all the generated sources are executed
    namespace = dict(tok)
    namespace.update(sim_funcs)

    feature_records = []
    for fn_meta in fn_tup:
        fn_name = fn_meta[0]
        fn_source = fn_meta[6]

        # define the function inside the shared namespace
        six.exec_(fn_source, namespace)

        feature_records.append({
            'function': namespace[fn_name],
            'feature_name': fn_name,
            'left_attribute': fn_meta[1],
            'right_attribute': fn_meta[2],
            'left_attr_tokenizer': fn_meta[3],
            'right_attr_tokenizer': fn_meta[4],
            'simfunction': fn_meta[5],
            'function_source': fn_source,
            'is_auto_generated': True,
        })
    return feature_records
def flatten_list(inp_list):
    """
    Flatten one level of nesting: return a new list holding the items of
    every sub-list of `inp_list`, in order.
    """
    flattened = []
    for sublist in inp_list:
        flattened.extend(sublist)
    return flattened
# Show the user inferred attribute types and features and request
# user permission to proceed
def validate_attr_types(l_attr_types, r_attr_types, attr_corres):
    """
    Show the user the inferred attribute types along with example features
    and ask for permission to proceed.

    Args:
        l_attr_types,r_attr_types (dictionary): The attribute types of the
            left and right tables.
        attr_corres (dictionary): The attribute correspondences between the
            tables. The pairs are read from its 'corres' entry.

    Returns:
        The displayed pandas DataFrame (one row per correspondence) if the
        user answers 'y', and None if the user answers 'n'. Any other
        answer makes the question repeat.

    Raises:
        AssertionError: If one of the arguments is not of type python
            dictionary.
        TypeError: If an attribute type has no readable name.
    """
    # # We expect the l_attr_types to be of type dictionary
    validate_object_type(l_attr_types, dict, 'Left table attribute types')
    # # We expect the r_attr_types to be of type dictionary
    validate_object_type(r_attr_types, dict, 'Right table attribute types')
    # # We expect the attr_corres to be of type dictionary
    validate_object_type(attr_corres, dict, 'Attribute correspondences')

    corres_features_list = []
    print('The table shows the corresponding attributes along with their respective '
          'types.\nPlease confirm that the information has been correctly inferred.\n'
          'If you would like to skip this validation process in the future,\nplease set '
          'the flag validate_inferred_attr_types equal to false.')

    # Describe each attr. correspondence
    for ac in attr_corres['corres']:
        l_attr_type = l_attr_types[ac[0]]
        r_attr_type = r_attr_types[ac[1]]

        # get human readable type names
        readable_l_type = _get_readable_type_name(l_attr_type)
        readable_r_type = _get_readable_type_name(r_attr_type)

        # Features are generated only if the attribute types are same
        if l_attr_type != r_attr_type:
            logger.info('py_entitymatching types: %s type (%s) and %s type (%s) '
                        'are different.'
                        'If you want to set them to be same and '
                        'generate features, '
                        'update output from get_attr_types and '
                        'use get_features command.\n.',
                        ac[0], l_attr_type, ac[1], r_attr_type)
            features = ['N/A']
        else:
            features = _get_features_for_type(l_attr_type)

        # Change features into more readable format
        readable_features = [_get_readable_feature_name(feat)
                             for feat in features]

        # Only the first two readable features are shown as examples
        readable_features_str = "; ".join(readable_features[:2])

        # Add information for this pair of corresponding attributes
        corres_features_list.append([ac[0], ac[1], readable_l_type,
                                     readable_r_type, readable_features_str])

    # Create the pandas dataframe from the lists and display it to the user
    labels = ['Left Attribute', 'Right Attribute', 'Left Attribute Type',
              'Right Attribute Type', 'Example Features']
    corres_feat_df = pd.DataFrame(corres_features_list, columns=labels)
    display(corres_feat_df)

    # Ask user if the inferred types are satisfactory. Repeat until a
    # valid answer is given
    while True:
        response = six.moves.input('Do you want to proceed? (y/n):')
        if response == 'y':
            return corres_feat_df
        elif response == 'n':
            print('\nIf the attribute correspondences or types have been inferred incorrectly,\n'
                  'use the get_features() function with your own correspondences and attribute\n'
                  'types to get the correct features for your data')
            return None
        else:
            print("You must answer with either 'y' or 'n'")
# get look up table to generate readable type names
def _get_type_name_lkp_tbl():
# Initialize a lookup table
lookup_table = dict()
# Map type names to more human readable names
lookup_table['str_eq_1w'] = 'short string (1 word)'
lookup_table['str_bt_1w_5w'] = 'short string (1 word to 5 words)'
lookup_table['str_bt_5w_10w'] = 'medium string (5 words to 10 words)'
lookup_table['str_gt_10w'] = 'short string (1 word)'
lookup_table['numeric'] = 'numeric'
lookup_table['boolean'] = 'boolean'
lookup_table['un_determined'] = 'un-determined type'
return lookup_table
# Get readable names for the types
def _get_readable_type_name(column_type):
    """
    Return the human readable name of an attribute type.

    Raises a TypeError when the type is not in the type name lookup table.
    """
    readable_names = _get_type_name_lkp_tbl()

    # Unknown types are rejected up front
    if column_type not in readable_names:
        raise TypeError('Unknown type')
    return readable_names[column_type]
# get look up table to generate readable feature descriptions
def _get_feature_name_lkp_tbl():
# Initialize a lookup table
lookup_table = dict()
# Map features to more human readable descriptions
lookup_table['lev_dist'] = 'Levenshtein Distance'
lookup_table['lev_sim'] = 'Levenshtein Similarity'
lookup_table['jaro'] = 'Jaro Distance'
lookup_table['jaro_winkler'] = 'Jaro-Winkler Distance'
lookup_table['exact_match'] = 'Exact Match'
lookup_table['needleman_wunsch'] = 'Needleman-Wunsch Algorithm'
lookup_table['smith_waterman'] = 'Smith-Waterman Algorithm'
lookup_table['abs_norm'] = 'Absolute Norm'
lookup_table['jaccard'] = 'Jaccard Similarity'
lookup_table['monge_elkan'] = 'Monge-Elkan Algorithm'
lookup_table['cosine'] = 'Cosine Similarity'
lookup_table['qgm_1'] = "1-grams"
lookup_table['qgm_2'] = "2-grams"
lookup_table['qgm_3'] = "3-grams"
lookup_table['qgm_4'] = "4-grams"
lookup_table['dlm_dc0'] = 'Space Delimiter'
lookup_table['dlm_wsp'] = 'Whitespace Delimiter'
lookup_table['N/A'] = 'Not Applicable: Types do not match'
return lookup_table
# Get readable names for the features
def _get_readable_feature_name(feature):
    """
    Return a human readable description of a feature.

    A bare name is translated directly. A three-part feature is rendered
    as '<first> [<second>, <third>]' using the description of each part.
    Raises an AssertionError for a name missing from the lookup table, and
    for a non-string feature whose length is not 3.
    """
    descriptions = _get_feature_name_lkp_tbl()

    def describe(name):
        # translate one name, refusing names the table does not know
        if name not in descriptions:
            raise AssertionError('Feature is not present in lookup table')
        return descriptions[name]

    # A feature that is just a string maps to a single description
    if isinstance(feature, six.string_types):
        return describe(feature)

    if len(feature) != 3:
        raise AssertionError('Features should have either 0 or 2 (one for each table) tokenizers')

    # Describe each of the three parts, in order
    first, second, third = [describe(part) for part in feature]
    return first + ' [' + second + ', ' + third + "]"