# Source code for py_entitymatching.feature.simfunctions

# coding=utf-8
"""
This module contains similarity functions supported by py_entitymatching
"""

import pandas as pd
import numpy as np
import six

import py_stringmatching as sm
import py_entitymatching.utils.generic_helper as gh

# Canonical names of the supported similarity functions. The order matters:
# it is position-aligned with `abbreviations` below.
sim_function_names = [
    'affine',
    'hamming_dist',
    'hamming_sim',
    'lev_dist',
    'lev_sim',
    'jaro',
    'jaro_winkler',
    'needleman_wunsch',
    'smith_waterman',
    'overlap_coeff',
    'jaccard',
    'dice',
    'monge_elkan',
    'cosine',
    'exact_match',
    'rel_diff',
    'abs_norm',
]

# Short codes for the similarity functions, one per entry of
# `sim_function_names` and in the same order.
abbreviations = [
    'aff',
    'ham_dist',
    'ham_sim',
    'lev_dist',
    'lev_sim',
    'jar',
    'jwn',
    'nmw',
    'swn',
    'ovrlp',
    'jac',
    'dice',
    'mel',
    'cos',
    'exm',
    'rdf',
    'anm',
]

# Module-level lookup table pairing every function name with its short code.
_global_sim_fns = pd.DataFrame(
    dict(function_name=sim_function_names, short_name=abbreviations)
)


def get_sim_funs_for_blocking():
    """
    Return the similarity functions that may be used during blocking.

    Returns:

        A Python dictionary of similarity functions.


        Each key is a similarity function name and each value is the
        corresponding similarity function.

    Examples:
        >>> import py_entitymatching as em
        >>> block_s = em.get_sim_funs_for_blocking()

    """
    # Blocking is served from the same catalogue that get_sim_funs() builds.
    blocking_funs = get_sim_funs()
    return blocking_funs


def get_sim_funs_for_matching():
    """
    Return the similarity functions that may be used during matching.

    Returns:

        A Python dictionary of similarity functions.

        Each key is a similarity function name and each value is the
        corresponding similarity function.

    Examples:
        >>> import py_entitymatching as em
        >>> match_s = em.get_sim_funs_for_matching()


    """
    # Matching is served from the same catalogue that get_sim_funs() builds.
    matching_funs = get_sim_funs()
    return matching_funs


def get_sim_funs():
    """
    Return every similarity function supported by py_entitymatching.

    Returns:
        A dictionary mapping each name in ``sim_function_names`` to the
        similarity function of that name.

    """
    # The callables are listed in exactly the order of sim_function_names,
    # because the two sequences are paired up positionally below.
    supported = (
        affine,
        hamming_dist,
        hamming_sim,
        lev_dist,
        lev_sim,
        jaro,
        jaro_winkler,
        needleman_wunsch,
        smith_waterman,
        overlap_coeff,
        jaccard,
        dice,
        monge_elkan,
        cosine,
        exact_match,
        rel_diff,
        abs_norm,
    )
    # Pair names with callables: function name -> function object.
    return {name: fun for name, fun in zip(sim_function_names, supported)}


## String based similarity measures
def affine(s1, s2):
    """
    This function computes the affine measure between the two input strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The raw score of py_stringmatching's ``Affine`` measure if both the
        strings are not missing (i.e NaN or None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.affine('dva', 'deeva')
        1.5
        >>> em.affine(None, 'deeva')
        nan
    """
    # Missing inputs propagate as NaN. `np.nan` is used instead of the
    # `np.NaN` alias, which no longer exists in NumPy 2.0.
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Let the shared helper coerce both inputs before they reach the measure
    # (expected to yield unicode strings for non-string values).
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Create the similarity measure object and compute the score.
    measure = sm.Affine()
    return measure.get_raw_score(s1, s2)

def hamming_dist(s1, s2):
    """
    This function computes the Hamming distance between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure should
            be computed.

    Returns:
        The raw score of py_stringmatching's ``HammingDistance`` measure if
        both the strings are not missing (i.e NaN or None), else returns NaN.

    Note:
        Hamming distance is defined for strings of equal length; how inputs
        of unequal length are treated is left to the underlying measure.

    Examples:
        >>> import py_entitymatching as em
        >>> em.hamming_dist('alex', 'john')
        4
        >>> em.hamming_dist(None, 'john')
        nan


    """
    # Missing inputs propagate as NaN. `np.nan` is used instead of the
    # `np.NaN` alias, which no longer exists in NumPy 2.0.
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Let the shared helper coerce both inputs before they reach the measure
    # (expected to yield unicode strings for non-string values).
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Create the measure object and compute the distance.
    measure = sm.HammingDistance()
    return measure.get_raw_score(s1, s2)


def hamming_sim(s1, s2):
    """
    This function computes the Hamming similarity between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure should
            be computed.

    Returns:
        The similarity score of py_stringmatching's ``HammingDistance``
        measure if both the strings are not missing (i.e NaN or None), else
        returns NaN.

    Note:
        Hamming similarity is defined for strings of equal length; how inputs
        of unequal length are treated is left to the underlying measure.

    Examples:
        >>> import py_entitymatching as em
        >>> em.hamming_sim('alex', 'alxe')
        0.5
        >>> em.hamming_sim(None, 'alex')
        nan

    """
    # Missing inputs propagate as NaN. `np.nan` is used instead of the
    # `np.NaN` alias, which no longer exists in NumPy 2.0.
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Let the shared helper coerce both inputs before they reach the measure
    # (expected to yield unicode strings for non-string values).
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Create the measure object and compute the similarity score.
    measure = sm.HammingDistance()
    return measure.get_sim_score(s1, s2)


def lev_dist(s1, s2):
    """
    This function computes the Levenshtein distance between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure should
            be computed.

    Returns:
        The raw score of py_stringmatching's ``Levenshtein`` measure if both
        the strings are not missing (i.e NaN or None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.lev_dist('alex', 'alxe')
        2
        >>> em.lev_dist(None, 'alex')
        nan

    """
    # Missing inputs propagate as NaN. `np.nan` is used instead of the
    # `np.NaN` alias, which no longer exists in NumPy 2.0.
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Let the shared helper coerce both inputs before they reach the measure
    # (expected to yield unicode strings for non-string values).
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Create the measure object and compute the distance.
    measure = sm.Levenshtein()
    return measure.get_raw_score(s1, s2)


def lev_sim(s1, s2):
    """
    This function computes the Levenshtein similarity between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure should
            be computed.

    Returns:
        The similarity score of py_stringmatching's ``Levenshtein`` measure
        if both the strings are not missing (i.e NaN or None), else returns
        NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.lev_sim('alex', 'alxe')
        0.5
        >>> em.lev_sim(None, 'alex')
        nan

    """
    # Missing inputs propagate as NaN. `np.nan` is used instead of the
    # `np.NaN` alias, which no longer exists in NumPy 2.0.
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Let the shared helper coerce both inputs before they reach the measure
    # (expected to yield unicode strings for non-string values).
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Create the measure object and compute the similarity score.
    measure = sm.Levenshtein()
    return measure.get_sim_score(s1, s2)


def jaro(s1, s2):
    """
    This function computes the Jaro measure between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure should
            be computed.

    Returns:
        The raw score of py_stringmatching's ``Jaro`` measure if both the
        strings are not missing (i.e NaN or None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.jaro('MARTHA', 'MARHTA')
        0.9444444444444445
        >>> em.jaro(None, 'MARTHA')
        nan
    """
    # Missing inputs propagate as NaN. `np.nan` is used instead of the
    # `np.NaN` alias, which no longer exists in NumPy 2.0.
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Let the shared helper coerce both inputs before they reach the measure
    # (expected to yield unicode strings for non-string values).
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Create the measure object and compute the similarity measure.
    measure = sm.Jaro()
    return measure.get_raw_score(s1, s2)


def jaro_winkler(s1, s2):
    """
    This function computes the Jaro Winkler measure between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure should
            be computed.

    Returns:
        The raw score of py_stringmatching's ``JaroWinkler`` measure if both
        the strings are not missing (i.e NaN or None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.jaro_winkler('MARTHA', 'MARHTA')
        0.9611111111111111
        >>> em.jaro_winkler('MARTHA', None)
        nan

    """
    # Missing inputs propagate as NaN. `np.nan` is used instead of the
    # `np.NaN` alias, which no longer exists in NumPy 2.0.
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Let the shared helper coerce both inputs before they reach the measure
    # (expected to yield unicode strings for non-string values).
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Create the measure object and compute the similarity measure.
    measure = sm.JaroWinkler()
    return measure.get_raw_score(s1, s2)


def needleman_wunsch(s1, s2):
    """
    This function computes the Needleman-Wunsch measure between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure should
            be computed.

    Returns:
        The raw score of py_stringmatching's ``NeedlemanWunsch`` measure if
        both the strings are not missing (i.e NaN or None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.needleman_wunsch('dva', 'deeva')
        1.0
        >>> em.needleman_wunsch('dva', None)
        nan


    """
    # Missing inputs propagate as NaN. `np.nan` is used instead of the
    # `np.NaN` alias, which no longer exists in NumPy 2.0.
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Let the shared helper coerce both inputs before they reach the measure
    # (expected to yield unicode strings for non-string values).
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Create the measure object and compute the similarity measure.
    measure = sm.NeedlemanWunsch()
    return measure.get_raw_score(s1, s2)


def smith_waterman(s1, s2):
    """
    This function computes the Smith-Waterman measure between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure should
            be computed.

    Returns:
        The raw score of py_stringmatching's ``SmithWaterman`` measure if
        both the strings are not missing (i.e NaN or None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.smith_waterman('cat', 'hat')
        2.0
        >>> em.smith_waterman('cat', None)
        nan
    """
    # Missing inputs propagate as NaN. `np.nan` is used instead of the
    # `np.NaN` alias, which no longer exists in NumPy 2.0.
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Let the shared helper coerce both inputs before they reach the measure
    # (expected to yield unicode strings for non-string values).
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Create the measure object and compute the similarity measure.
    measure = sm.SmithWaterman()
    return measure.get_raw_score(s1, s2)


# Token-based measures
def jaccard(arr1, arr2):
    """
    This function computes the Jaccard measure between the two input
    lists/sets.

    Args:
        arr1,arr2 (list or set): The input lists or sets of tokens for which
            the Jaccard measure should be computed. Any other value is
            treated as a single token and wrapped in a one-element list.

    Returns:
        The raw score of py_stringmatching's ``Jaccard`` measure if both the
        lists/sets are not None and do not have any missing tokens (i.e
        NaN or None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.jaccard(['data', 'science'], ['data'])
        0.5
        >>> em.jaccard(['data', 'science'], None)
        nan
    """
    # `np.nan` is used instead of the `np.NaN` alias, which no longer exists
    # in NumPy 2.0.
    if arr1 is None or arr2 is None:
        return np.nan

    # Lists and sets are token collections; anything else (including a
    # scalar NaN read from a table) becomes a one-token list.
    if not isinstance(arr1, (list, set)):
        arr1 = [arr1]
    if not isinstance(arr2, (list, set)):
        arr2 = [arr2]

    # pd.isnull works element-wise on lists only, so a set is copied into a
    # list before its tokens are checked for missing values.
    if any(pd.isnull(list(arr1))) or any(pd.isnull(list(arr2))):
        return np.nan

    # Create the Jaccard measure object and compute the similarity score.
    measure = sm.Jaccard()
    return measure.get_raw_score(arr1, arr2)


def cosine(arr1, arr2):
    """
    This function computes the cosine measure between the two input
    lists/sets.

    Args:
        arr1,arr2 (list or set): The input lists or sets of tokens for which
            the cosine measure should be computed. Any other value is
            treated as a single token and wrapped in a one-element list.

    Returns:
        The raw score of py_stringmatching's ``Cosine`` measure if both the
        lists/sets are not None and do not have any missing tokens (i.e
        NaN or None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.cosine(['data', 'science'], ['data'])
        0.7071067811865475
        >>> em.cosine(['data', 'science'], None)
        nan

    """
    # `np.nan` is used instead of the `np.NaN` alias, which no longer exists
    # in NumPy 2.0.
    if arr1 is None or arr2 is None:
        return np.nan

    # Lists and sets are token collections; anything else (including a
    # scalar NaN read from a table) becomes a one-token list.
    if not isinstance(arr1, (list, set)):
        arr1 = [arr1]
    if not isinstance(arr2, (list, set)):
        arr2 = [arr2]

    # pd.isnull works element-wise on lists only, so a set is copied into a
    # list before its tokens are checked for missing values.
    if any(pd.isnull(list(arr1))) or any(pd.isnull(list(arr2))):
        return np.nan

    # Create the cosine measure object and compute the similarity score.
    measure = sm.Cosine()
    return measure.get_raw_score(arr1, arr2)


def overlap_coeff(arr1, arr2):
    """
    This function computes the overlap coefficient between the two input
    lists/sets.

    Args:
        arr1,arr2 (list or set): The input lists or sets of tokens for which
            the overlap coefficient should be computed. Any other value is
            treated as a single token and wrapped in a one-element list.

    Returns:
        The raw score of py_stringmatching's ``OverlapCoefficient`` measure
        if both the lists/sets are not None and do not have any missing
        tokens (i.e NaN or None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.overlap_coeff(['data', 'science'], ['data'])
        1.0
        >>> em.overlap_coeff(['data', 'science'], None)
        nan

    """
    # `np.nan` is used instead of the `np.NaN` alias, which no longer exists
    # in NumPy 2.0.
    if arr1 is None or arr2 is None:
        return np.nan

    # Lists and sets are token collections; anything else (including a
    # scalar NaN read from a table) becomes a one-token list.
    if not isinstance(arr1, (list, set)):
        arr1 = [arr1]
    if not isinstance(arr2, (list, set)):
        arr2 = [arr2]

    # pd.isnull works element-wise on lists only, so a set is copied into a
    # list before its tokens are checked for missing values.
    if any(pd.isnull(list(arr1))) or any(pd.isnull(list(arr2))):
        return np.nan

    # Create the overlap coefficient measure object and compute the score.
    measure = sm.OverlapCoefficient()
    return measure.get_raw_score(arr1, arr2)

def dice(arr1, arr2):
    """
    This function computes the Dice score between the two input
    lists/sets.

    Args:
        arr1,arr2 (list or set): The input lists or sets of tokens for which
            the Dice score should be computed. Any other value is treated as
            a single token and wrapped in a one-element list.

    Returns:
        The raw score of py_stringmatching's ``Dice`` measure if both the
        lists/sets are not None and do not have any missing tokens (i.e
        NaN or None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.dice(['data', 'science'], ['data'])
        0.6666666666666666
        >>> em.dice(['data', 'science'], None)
        nan

    """
    # `np.nan` is used instead of the `np.NaN` alias, which no longer exists
    # in NumPy 2.0.
    if arr1 is None or arr2 is None:
        return np.nan

    # Lists and sets are token collections; anything else (including a
    # scalar NaN read from a table) becomes a one-token list.
    if not isinstance(arr1, (list, set)):
        arr1 = [arr1]
    if not isinstance(arr2, (list, set)):
        arr2 = [arr2]

    # pd.isnull works element-wise on lists only, so a set is copied into a
    # list before its tokens are checked for missing values.
    if any(pd.isnull(list(arr1))) or any(pd.isnull(list(arr2))):
        return np.nan

    # Create the Dice measure object and compute the score.
    measure = sm.Dice()
    return measure.get_raw_score(arr1, arr2)

# Hybrid measure
def monge_elkan(arr1, arr2):
    """
    This function computes the Monge-Elkan measure between the two input
    lists/sets. The measure object is created with its default settings, so
    the secondary (token-level) similarity function is whatever
    py_stringmatching's ``MongeElkan`` defaults to (documented as
    Jaro-Winkler).

    Args:
        arr1,arr2 (list or set): The input lists or sets of tokens for which
            the Monge-Elkan measure should be computed. Any other value is
            treated as a single token and wrapped in a one-element list.

    Returns:
        The raw score of py_stringmatching's ``MongeElkan`` measure if both
        the lists/sets are not None and do not have any missing tokens (i.e
        NaN or None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.monge_elkan(['Niall'], ['Neal'])
        0.8049999999999999
        >>> em.monge_elkan(['Niall'], None)
        nan
    """
    # `np.nan` is used instead of the `np.NaN` alias, which no longer exists
    # in NumPy 2.0.
    if arr1 is None or arr2 is None:
        return np.nan

    # Lists and sets are token collections; anything else (including a
    # scalar NaN read from a table) becomes a one-token list.
    if not isinstance(arr1, (list, set)):
        arr1 = [arr1]
    if not isinstance(arr2, (list, set)):
        arr2 = [arr2]

    # pd.isnull works element-wise on lists only, so a set is copied into a
    # list before its tokens are checked for missing values.
    if any(pd.isnull(list(arr1))) or any(pd.isnull(list(arr2))):
        return np.nan

    # Create the Monge-Elkan measure object and compute the measure.
    measure = sm.MongeElkan()
    return measure.get_raw_score(arr1, arr2)




# boolean/string/numeric similarity measure
def exact_match(d1, d2):
    """
    This function checks whether two objects match exactly. Typically the
    objects are strings, booleans or ints.

    Args:
        d1,d2 (str, boolean, int): The input objects which should be checked
            for an exact match.

    Returns:
        1 if the objects compare equal with ``==``, else 0. If either of the
        objects is NaN or None, NaN is returned instead.

    Examples:
        >>> import py_entitymatching as em
        >>> em.exact_match('Niall', 'Neal')
        0
        >>> em.exact_match('Niall', 'Niall')
        1
        >>> em.exact_match(10, 10)
        1
        >>> em.exact_match(10, 20)
        0
        >>> em.exact_match(True, True)
        1
        >>> em.exact_match(False, True)
        0
        >>> em.exact_match(10, None)
        nan
    """
    # Missing inputs propagate as NaN. `np.nan` is used instead of the
    # `np.NaN` alias, which no longer exists in NumPy 2.0.
    if d1 is None or d2 is None:
        return np.nan
    if pd.isnull(d1) or pd.isnull(d2):
        return np.nan

    # Report the match as an integer flag rather than a bool.
    return 1 if d1 == d2 else 0


# numeric similarity measure
def rel_diff(d1, d2):
    """
    This function computes the relative difference between two numbers,
    ``2 * |d1 - d2| / (d1 + d2)``.

    Args:
        d1,d2 (float): The input numbers for which the relative difference
            must be computed. Values that ``float()`` can convert (such as
            numeric strings) are accepted.

    Returns:
        A float value of relative difference between the input numbers (if
        they are valid). Two zeros give 0.0. NaN is returned if one of the
        inputs is NaN or None, if an input cannot be converted to a float,
        or if ``d1 + d2`` is zero for non-zero inputs (the ratio is
        undefined there).

    Examples:
        >>> import py_entitymatching as em
        >>> em.rel_diff(100, 200)
        0.6666666666666666
        >>> em.rel_diff(100, 100)
        0.0
        >>> em.rel_diff(100, None)
        nan
    """
    # Missing inputs propagate as NaN. `np.nan` is used instead of the
    # `np.NaN` alias, which no longer exists in NumPy 2.0.
    if d1 is None or d2 is None:
        return np.nan
    if pd.isnull(d1) or pd.isnull(d2):
        return np.nan

    # float() raises ValueError for unparsable text and TypeError for
    # unsupported types; both mean "not a usable number".
    try:
        d1 = float(d1)
        d2 = float(d2)
    except (TypeError, ValueError):
        return np.nan

    # Two zeros are identical, so their relative difference is defined as 0.
    if d1 == 0.0 and d2 == 0.0:
        return 0.0

    # Opposite values (d1 == -d2) would otherwise divide by zero.
    denominator = d1 + d2
    if denominator == 0.0:
        return np.nan

    # Compute the relative difference between two numbers
    # ref: https://en.wikipedia.org/wiki/Relative_change_and_difference
    return (2 * abs(d1 - d2)) / denominator


# compute absolute norm similarity
def abs_norm(d1, d2):
    """
    This function computes the absolute norm similarity between two numbers,
    ``1 - |d1 - d2| / max(|d1|, |d2|)``.

    Args:
        d1,d2 (float): Input numbers for which the absolute norm must
            be computed. Values that ``float()`` can convert (such as
            numeric strings) are accepted.

    Returns:
        A float value of absolute norm similarity between the input numbers
        (if they are valid). Equal inputs, including two zeros, give 1.0.
        The value can be negative when the inputs have opposite signs. NaN
        is returned if one of the inputs is NaN or None, or if an input
        cannot be converted to a float.

    Examples:
        >>> import py_entitymatching as em
        >>> em.abs_norm(100, 200)
        0.5
        >>> em.abs_norm(100, 100)
        1.0
        >>> em.abs_norm(100, None)
        nan

    """
    # Missing inputs propagate as NaN. `np.nan` is used instead of the
    # `np.NaN` alias, which no longer exists in NumPy 2.0.
    if d1 is None or d2 is None:
        return np.nan
    if pd.isnull(d1) or pd.isnull(d2):
        return np.nan

    # float() raises ValueError for unparsable text and TypeError for
    # unsupported types; both mean "not a usable number".
    try:
        d1 = float(d1)
        d2 = float(d2)
    except (TypeError, ValueError):
        return np.nan

    # Two zeros are identical, so they get the same similarity as any other
    # identical pair; this also avoids dividing by max(|d1|, |d2|) == 0.
    if d1 == 0.0 and d2 == 0.0:
        return 1.0

    # Normalised absolute difference between the two numbers.
    x = abs(d1 - d2) / max(abs(d1), abs(d2))
    # Differences at or below 10e-5 (i.e. 1e-4) are treated as no difference.
    if x <= 10e-5:
        x = 0
    return 1.0 - x