# Source code for py_entitymatching.feature.simfunctions

# coding=utf-8
"""
This module contains similarity functions supported by py_entitymatching
"""

import pandas as pd
import numpy as np
import six

import py_stringmatching as sm
import py_entitymatching.utils.generic_helper as gh

# These are the sim. function names
# NOTE: get_sim_funs() zips this list with its list of callables, so the
# order of the names here must stay in step with that list.
sim_function_names = [
    'affine',
    'hamming_dist',
    'hamming_sim',
    'lev_dist',
    'lev_sim',
    'jaro',
    'jaro_winkler',
    'needleman_wunsch',
    'smith_waterman',
    'overlap_coeff',
    'jaccard',
    'dice',
    'monge_elkan',
    'cosine',
    'exact_match',
    'rel_diff',
    'abs_norm',
]

# abbreviations for sim. functions
# One short name per similarity function, in the same order as the full
# function names (the two lists are combined column-wise below).
abbreviations = [
    'aff',
    'ham_dist',
    'ham_sim',
    'lev_dist',
    'lev_sim',
    'jar',
    'jwn',
    'nmw',
    'swn',
    'ovrlp',
    'jac',
    'dice',
    'mel',
    'cos',
    'exm',
    'rdf',
    'anm',
]

# global function names
# Two-column lookup table: full function name alongside its short name.
_global_sim_fns = pd.DataFrame.from_dict(
    dict(function_name=sim_function_names, short_name=abbreviations)
)


def get_sim_funs_for_blocking():
    """
    Return the similarity functions that can be used for blocking.

    Returns:
        A Python dictionary of similarity functions, keyed by similarity
        function name; each value is the actual similarity function.

    Examples:
        >>> import py_entitymatching as em
        >>> block_s = em.get_sim_funs_for_blocking()
    """
    # Blocking uses the full set of supported similarity functions.
    blocking_sim_funs = get_sim_funs()
    return blocking_sim_funs
def get_sim_funs_for_matching():
    """
    Return the similarity functions that can be used for matching.

    Returns:
        A Python dictionary of similarity functions, keyed by similarity
        function name; each value is the actual similarity function.

    Examples:
        >>> import py_entitymatching as em
        >>> match_s = em.get_sim_funs_for_matching()
    """
    # Matching uses the full set of supported similarity functions.
    matching_sim_funs = get_sim_funs()
    return matching_sim_funs
def get_sim_funs():
    """
    Return all the similarity functions supported by py_entitymatching.

    Returns:
        A Python dictionary whose keys are the entries of
        ``sim_function_names`` and whose values are the corresponding
        similarity functions.
    """
    # The callables, listed in the same order as ``sim_function_names``.
    supported = (
        affine,
        hamming_dist, hamming_sim,
        lev_dist, lev_sim,
        jaro,
        jaro_winkler,
        needleman_wunsch,
        smith_waterman,
        overlap_coeff, jaccard, dice,
        monge_elkan, cosine,
        exact_match, rel_diff, abs_norm,
    )
    # Pair every name with the callable found at the same position.
    return {name: fn for name, fn in zip(sim_function_names, supported)}


## String based similarity measures
def affine(s1, s2):
    """
    This function computes the affine measure between the two input strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The affine measure if both the strings are not missing (i.e. NaN or
        None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.affine('dva', 'deeva')
        1.5
        >>> em.affine(None, 'deeva')
        nan
    """
    # Missing inputs short-circuit to NaN. ``np.nan`` is used because the
    # ``np.NaN`` alias no longer exists in NumPy 2.0.
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Create the similarity measure object
    measure = sm.Affine()

    # Run both inputs through the project's str/unicode conversion helper
    # before handing them to the measure.
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity
    return measure.get_raw_score(s1, s2)
def hamming_dist(s1, s2):
    """
    This function computes the Hamming distance between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Hamming distance if both the strings are not missing (i.e. NaN or
        None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.hamming_dist('alex', 'john')
        4
        >>> em.hamming_dist(None, 'john')
        nan
    """
    # Missing inputs short-circuit to NaN. ``np.nan`` is used because the
    # ``np.NaN`` alias no longer exists in NumPy 2.0.
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Create the similarity measure object
    measure = sm.HammingDistance()

    # Run both inputs through the project's str/unicode conversion helper.
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the distance
    return measure.get_raw_score(s1, s2)
def hamming_sim(s1, s2):
    """
    This function computes the Hamming similarity between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Hamming similarity if both the strings are not missing (i.e. NaN
        or None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.hamming_sim('alex', 'alxe')
        0.5
        >>> em.hamming_sim(None, 'alex')
        nan
    """
    # Missing inputs short-circuit to NaN. ``np.nan`` is used because the
    # ``np.NaN`` alias no longer exists in NumPy 2.0.
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Create the similarity measure object
    measure = sm.HammingDistance()

    # Run both inputs through the project's str/unicode conversion helper.
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity score.
    return measure.get_sim_score(s1, s2)
def lev_dist(s1, s2):
    """
    This function computes the Levenshtein distance between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Levenshtein distance if both the strings are not missing (i.e.
        NaN or None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.lev_dist('alex', 'alxe')
        2
        >>> em.lev_dist(None, 'alex')
        nan
    """
    # Missing inputs short-circuit to NaN. ``np.nan`` is used because the
    # ``np.NaN`` alias no longer exists in NumPy 2.0.
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Create the similarity measure object
    measure = sm.Levenshtein()

    # Run both inputs through the project's str/unicode conversion helper.
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the distance measure.
    return measure.get_raw_score(s1, s2)
def lev_sim(s1, s2):
    """
    This function computes the Levenshtein similarity between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Levenshtein similarity if both the strings are not missing (i.e.
        NaN or None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.lev_sim('alex', 'alxe')
        0.5
        >>> em.lev_sim(None, 'alex')
        nan
    """
    # Missing inputs short-circuit to NaN. ``np.nan`` is used because the
    # ``np.NaN`` alias no longer exists in NumPy 2.0.
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Create the similarity measure object
    measure = sm.Levenshtein()

    # Run both inputs through the project's str/unicode conversion helper.
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure
    return measure.get_sim_score(s1, s2)
def jaro(s1, s2):
    """
    This function computes the Jaro measure between the two input strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Jaro measure if both the strings are not missing (i.e. NaN or
        None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.jaro('MARTHA', 'MARHTA')
        0.9444444444444445
        >>> em.jaro(None, 'MARTHA')
        nan
    """
    # Missing inputs short-circuit to NaN. ``np.nan`` is used because the
    # ``np.NaN`` alias no longer exists in NumPy 2.0.
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Create the similarity measure object
    measure = sm.Jaro()

    # Run both inputs through the project's str/unicode conversion helper.
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
def jaro_winkler(s1, s2):
    """
    This function computes the Jaro Winkler measure between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Jaro Winkler measure if both the strings are not missing (i.e.
        NaN or None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.jaro_winkler('MARTHA', 'MARHTA')
        0.9611111111111111
        >>> em.jaro_winkler('MARTHA', None)
        nan
    """
    # Missing inputs short-circuit to NaN. ``np.nan`` is used because the
    # ``np.NaN`` alias no longer exists in NumPy 2.0.
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Create the similarity measure object
    measure = sm.JaroWinkler()

    # Run both inputs through the project's str/unicode conversion helper.
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
def needleman_wunsch(s1, s2):
    """
    This function computes the Needleman-Wunsch measure between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Needleman-Wunsch measure if both the strings are not missing
        (i.e. NaN or None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.needleman_wunsch('dva', 'deeva')
        1.0
        >>> em.needleman_wunsch('dva', None)
        nan
    """
    # Missing inputs short-circuit to NaN. ``np.nan`` is used because the
    # ``np.NaN`` alias no longer exists in NumPy 2.0.
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Create the similarity measure object
    measure = sm.NeedlemanWunsch()

    # Run both inputs through the project's str/unicode conversion helper.
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
def smith_waterman(s1, s2):
    """
    This function computes the Smith-Waterman measure between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Smith-Waterman measure if both the strings are not missing (i.e.
        NaN or None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.smith_waterman('cat', 'hat')
        2.0
        >>> em.smith_waterman('cat', None)
        nan
    """
    # Missing inputs short-circuit to NaN. ``np.nan`` is used because the
    # ``np.NaN`` alias no longer exists in NumPy 2.0.
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Create the similarity measure object
    measure = sm.SmithWaterman()

    # Run both inputs through the project's str/unicode conversion helper.
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
# Token-based measures
def jaccard(arr1, arr2):
    """
    This function computes the Jaccard measure between the two input
    lists/sets.

    Args:
        arr1,arr2 (list or set): The input lists or sets for which the
            Jaccard measure should be computed. Any other value is treated
            as a single token.

    Returns:
        The Jaccard measure if both the lists/sets are not None and do not
        have any missing tokens (i.e. NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.jaccard(['data', 'science'], ['data'])
        0.5
        >>> em.jaccard(['data', 'science'], None)
        nan
    """
    # ``np.nan`` is used because the ``np.NaN`` alias no longer exists in
    # NumPy 2.0.
    if arr1 is None or arr2 is None:
        return np.nan

    # Lists and sets are already token collections; anything else is wrapped
    # as a single token. Sets must not be wrapped, otherwise their members
    # are never checked for missing values.
    if not isinstance(arr1, (list, set)):
        arr1 = [arr1]
    # The null check runs on a list copy so sets are inspected element-wise.
    if any(pd.isnull(list(arr1))):
        return np.nan

    if not isinstance(arr2, (list, set)):
        arr2 = [arr2]
    if any(pd.isnull(list(arr2))):
        return np.nan

    # Create jaccard measure object
    measure = sm.Jaccard()

    # Call a function to compute a similarity score
    return measure.get_raw_score(arr1, arr2)
def cosine(arr1, arr2):
    """
    This function computes the cosine measure between the two input
    lists/sets.

    Args:
        arr1,arr2 (list or set): The input lists or sets for which the cosine
            measure should be computed. Any other value is treated as a
            single token.

    Returns:
        The cosine measure if both the lists/sets are not None and do not
        have any missing tokens (i.e. NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.cosine(['data', 'science'], ['data'])
        0.7071067811865475
        >>> em.cosine(['data', 'science'], None)
        nan
    """
    # ``np.nan`` is used because the ``np.NaN`` alias no longer exists in
    # NumPy 2.0.
    if arr1 is None or arr2 is None:
        return np.nan

    # Lists and sets are already token collections; anything else is wrapped
    # as a single token. Sets must not be wrapped, otherwise their members
    # are never checked for missing values.
    if not isinstance(arr1, (list, set)):
        arr1 = [arr1]
    # The null check runs on a list copy so sets are inspected element-wise.
    if any(pd.isnull(list(arr1))):
        return np.nan

    if not isinstance(arr2, (list, set)):
        arr2 = [arr2]
    if any(pd.isnull(list(arr2))):
        return np.nan

    # Create cosine measure object
    measure = sm.Cosine()

    # Call the function to compute the cosine measure.
    return measure.get_raw_score(arr1, arr2)
def overlap_coeff(arr1, arr2):
    """
    This function computes the overlap coefficient between the two input
    lists/sets.

    Args:
        arr1,arr2 (list or set): The input lists or sets for which the
            overlap coefficient should be computed. Any other value is
            treated as a single token.

    Returns:
        The overlap coefficient if both the lists/sets are not None and do
        not have any missing tokens (i.e. NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.overlap_coeff(['data', 'science'], ['data'])
        1.0
        >>> em.overlap_coeff(['data', 'science'], None)
        nan
    """
    # ``np.nan`` is used because the ``np.NaN`` alias no longer exists in
    # NumPy 2.0.
    if arr1 is None or arr2 is None:
        return np.nan

    # Lists and sets are already token collections; anything else is wrapped
    # as a single token. Sets must not be wrapped, otherwise their members
    # are never checked for missing values.
    if not isinstance(arr1, (list, set)):
        arr1 = [arr1]
    # The null check runs on a list copy so sets are inspected element-wise.
    if any(pd.isnull(list(arr1))):
        return np.nan

    if not isinstance(arr2, (list, set)):
        arr2 = [arr2]
    if any(pd.isnull(list(arr2))):
        return np.nan

    # Create overlap coefficient measure object
    measure = sm.OverlapCoefficient()

    # Call the function to return the overlap coefficient
    return measure.get_raw_score(arr1, arr2)
def dice(arr1, arr2):
    """
    This function computes the Dice score between the two input lists/sets.

    Args:
        arr1,arr2 (list or set): The input lists or sets for which the Dice
            score should be computed. Any other value is treated as a single
            token.

    Returns:
        The Dice score if both the lists/sets are not None and do not have
        any missing tokens (i.e. NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.dice(['data', 'science'], ['data'])
        0.6666666666666666
        >>> em.dice(['data', 'science'], None)
        nan
    """
    # ``np.nan`` is used because the ``np.NaN`` alias no longer exists in
    # NumPy 2.0.
    if arr1 is None or arr2 is None:
        return np.nan

    # Lists and sets are already token collections; anything else is wrapped
    # as a single token. Sets must not be wrapped, otherwise their members
    # are never checked for missing values.
    if not isinstance(arr1, (list, set)):
        arr1 = [arr1]
    # The null check runs on a list copy so sets are inspected element-wise.
    if any(pd.isnull(list(arr1))):
        return np.nan

    if not isinstance(arr2, (list, set)):
        arr2 = [arr2]
    if any(pd.isnull(list(arr2))):
        return np.nan

    # Create Dice object
    measure = sm.Dice()

    # Call the function to return the dice score
    return measure.get_raw_score(arr1, arr2)
# Hybrid measure
def monge_elkan(arr1, arr2):
    """
    This function computes the Monge-Elkan measure between the two input
    lists/sets. Specifically, this function uses Jaro-Winkler measure as the
    secondary function to compute the similarity score.

    Args:
        arr1,arr2 (list or set): The input lists or sets for which the
            Monge-Elkan measure should be computed. Any other value is
            treated as a single token.

    Returns:
        The Monge-Elkan measure if both the lists/sets are not None and do
        not have any missing tokens (i.e. NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.monge_elkan(['Niall'], ['Neal'])
        0.8049999999999999
        >>> em.monge_elkan(['Niall'], None)
        nan
    """
    # ``np.nan`` is used because the ``np.NaN`` alias no longer exists in
    # NumPy 2.0.
    if arr1 is None or arr2 is None:
        return np.nan

    # Lists and sets are already token collections; anything else is wrapped
    # as a single token. Sets must not be wrapped, otherwise their members
    # are never checked for missing values.
    if not isinstance(arr1, (list, set)):
        arr1 = [arr1]
    # The null check runs on a list copy so sets are inspected element-wise.
    if any(pd.isnull(list(arr1))):
        return np.nan

    if not isinstance(arr2, (list, set)):
        arr2 = [arr2]
    if any(pd.isnull(list(arr2))):
        return np.nan

    # Create Monge-Elkan measure object (constructed with its defaults)
    measure = sm.MongeElkan()

    # Call the function to compute the Monge-Elkan measure
    return measure.get_raw_score(arr1, arr2)
# boolean/string/numeric similarity measure
def exact_match(d1, d2):
    """
    This function checks whether two objects match exactly. Typically the
    objects are strings, booleans or ints.

    Args:
        d1,d2 (str, boolean, int): The input objects which should be checked
            for an exact match.

    Returns:
        A value of 1 is returned if they match exactly, else returns 0.
        Further, if one of the objects is NaN or None, it returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.exact_match('Niall', 'Neal')
        0
        >>> em.exact_match('Niall', 'Niall')
        1
        >>> em.exact_match(10, 10)
        1
        >>> em.exact_match(10, 20)
        0
        >>> em.exact_match(True, True)
        1
        >>> em.exact_match(False, True)
        0
        >>> em.exact_match(10, None)
        nan
    """
    # Missing inputs short-circuit to NaN. ``np.nan`` is used because the
    # ``np.NaN`` alias no longer exists in NumPy 2.0.
    if d1 is None or d2 is None:
        return np.nan
    if pd.isnull(d1) or pd.isnull(d2):
        return np.nan

    # Check if they match exactly
    return 1 if d1 == d2 else 0
# numeric similarity measure
def rel_diff(d1, d2):
    """
    This function computes the relative difference between two numbers.

    The value returned is ``2 * |d1 - d2| / (d1 + d2)``.

    Args:
        d1,d2 (float): The input numbers for which the relative difference
            must be computed.

    Returns:
        A float value of relative difference between the input numbers (if
        they are valid). NaN is returned if one of the inputs is NaN or None,
        if an input cannot be converted to a float, or if the inputs sum to
        zero without both being zero (the ratio is undefined there). Two
        zeros give 0.0.

    Examples:
        >>> import py_entitymatching as em
        >>> em.rel_diff(100, 200)
        0.6666666666666666
        >>> em.rel_diff(100, 100)
        0.0
        >>> em.rel_diff(100, None)
        nan
    """
    # Missing inputs short-circuit to NaN. ``np.nan`` is used because the
    # ``np.NaN`` alias no longer exists in NumPy 2.0.
    if d1 is None or d2 is None:
        return np.nan
    if pd.isnull(d1) or pd.isnull(d2):
        return np.nan

    try:
        d1 = float(d1)
        d2 = float(d2)
    except (TypeError, ValueError):
        # Not interpretable as a number.
        return np.nan

    if d1 == 0.0 and d2 == 0.0:
        return 0.0

    denominator = d1 + d2
    if denominator == 0.0:
        # d1 == -d2: dividing would raise ZeroDivisionError, so report the
        # undefined result the same way as any other invalid input.
        return np.nan

    # Compute the relative difference between two numbers
    # ref: https://en.wikipedia.org/wiki/Relative_change_and_difference
    return (2 * abs(d1 - d2)) / denominator
# compute absolute norm similarity
def abs_norm(d1, d2):
    """
    This function computes the absolute norm similarity between two numbers.

    The value returned is ``1 - |d1 - d2| / max(|d1|, |d2|)``.

    Args:
        d1,d2 (float): Input numbers for which the absolute norm must be
            computed.

    Returns:
        A float value of absolute norm between the input numbers (if they
        are valid). NaN is returned if one of the inputs is NaN or None, or
        if an input cannot be converted to a float. Two zeros are identical
        and therefore give 1.0.

    Examples:
        >>> import py_entitymatching as em
        >>> em.abs_norm(100, 200)
        0.5
        >>> em.abs_norm(100, 100)
        1.0
        >>> em.abs_norm(100, None)
        nan
    """
    # Missing inputs short-circuit to NaN. ``np.nan`` is used because the
    # ``np.NaN`` alias no longer exists in NumPy 2.0.
    if d1 is None or d2 is None:
        return np.nan
    if pd.isnull(d1) or pd.isnull(d2):
        return np.nan

    try:
        d1 = float(d1)
        d2 = float(d2)
    except (TypeError, ValueError):
        # Not interpretable as a number.
        return np.nan

    if d1 == 0.0 and d2 == 0.0:
        # Guard against 0/0 below. Equal inputs score 1.0 for every other
        # value, so two zeros are reported as a perfect match as well.
        return 1.0

    # Compute absolute norm similarity between two numbers.
    x = abs(d1 - d2) / max(abs(d1), abs(d2))
    # Normalised differences at or below 10e-5 are treated as no difference.
    if x <= 10e-5:
        x = 0
    return 1.0 - x