# Source code for py_stringmatching.similarity_measure.generalized_jaccard

"""Generalized jaccard similarity measure"""

from py_stringmatching import utils
from py_stringmatching.similarity_measure.jaro import Jaro
from py_stringmatching.similarity_measure.hybrid_similarity_measure import \
                                                    HybridSimilarityMeasure


class GeneralizedJaccard(HybridSimilarityMeasure):
    """Generalized Jaccard similarity measure class.

    Parameters:
        sim_func (function): Similarity function applied to every pair of
                             tokens (one token from each input set). It must
                             return a score in the range [0,1]; otherwise
                             get_raw_score raises a ValueError. Optional,
                             defaults to the Jaro similarity measure.
        threshold (float): Threshold value (defaults to 0.5). A token pair is
                           considered a match only if its similarity score is
                           strictly greater than the threshold.
    """
    def __init__(self, sim_func=Jaro().get_raw_score, threshold=0.5):
        self.sim_func = sim_func
        self.threshold = threshold
        super(GeneralizedJaccard, self).__init__()

    def get_raw_score(self, set1, set2):
        """
        Computes the Generalized Jaccard measure between two sets.

        This similarity measure is a softened version of the Jaccard measure.
        The Jaccard measure is a promising candidate for tokens which exactly
        match across the sets. However, in practice tokens are often
        misspelled, such as energy vs. eneryg. The Generalized Jaccard measure
        will enable matching in such cases.

        Every token of the first set is scored against every token of the
        second set with the configured similarity function. Pairs whose score
        exceeds the threshold are then picked greedily, best score first, such
        that no token is used in more than one picked pair. The result is the
        sum of the picked scores divided by
        (len(set1) + len(set2) - number of picked pairs).

        Args:
            set1,set2 (set or list): Input sets (or lists) of strings. Input lists are converted to sets.

        Returns:
            Generalized Jaccard similarity (float)

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
            ValueError : If the similarity measure doesn't return values in the range [0,1]

        Examples:
            >>> gj = GeneralizedJaccard()
            >>> gj.get_raw_score(['data', 'science'], ['data'])
            0.5
            >>> gj.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
            0.3333333333333333
            >>> gj.get_raw_score(['Niall'], ['Neal', 'Njall'])
            0.43333333333333335
            >>> gj = GeneralizedJaccard(sim_func=JaroWinkler().get_raw_score, threshold=0.8)
            >>> gj.get_raw_score(['Comp', 'Sci.', 'and', 'Engr', 'Dept.,', 'Universty', 'of', 'Cal,', 'San', 'Deigo'],
            ...                  ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
            0.45810185185185187
        """
        # input validations
        utils.sim_check_for_none(set1, set2)
        utils.sim_check_for_list_or_set_inputs(set1, set2)

        # if exact match return 1.0
        if utils.sim_check_for_exact_match(set1, set2):
            return 1.0

        # if one of the inputs is empty return 0.0 (a float, like every
        # other return value of this method)
        if utils.sim_check_for_empty(set1, set2):
            return 0.0

        if not isinstance(set1, set):
            set1 = set(set1)
        if not isinstance(set2, set):
            set2 = set(set2)

        # score every cross pair and keep those above the threshold as
        # (token from set1, token from set2, score) tuples
        candidates = []
        for token1 in set1:
            for token2 in set2:
                score = self.sim_func(token1, token2)
                if score > 1 or score < 0:
                    raise ValueError('Similarity measure should'
                                     ' return value in the range [0,1]')
                if score > self.threshold:
                    candidates.append((token1, token2, score))

        # best-scoring pairs first; the sort is stable, so pairs with equal
        # scores keep the order in which the sets were iterated
        candidates.sort(key=lambda pair: pair[2], reverse=True)

        # select pairs in decreasing order of their score,
        # do not reselect the same element from either set.
        matched1 = set()
        matched2 = set()
        match_score = 0.0
        for token1, token2, score in candidates:
            if token1 in matched1 or token2 in matched2:
                continue
            matched1.add(token1)
            matched2.add(token2)
            match_score += score

        # every selected pair contributes exactly one new token to matched1
        match_count = len(matched1)

        return float(match_score) / float(len(set1) + len(set2) - match_count)

    def get_sim_score(self, set1, set2):
        """
        Computes the normalized Generalized Jaccard similarity between two sets.

        This simply delegates to get_raw_score, so the normalized score is
        identical to the raw score.

        Args:
            set1,set2 (set or list): Input sets (or lists) of strings. Input lists are converted to sets.

        Returns:
            Normalized Generalized Jaccard similarity (float)

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
            ValueError : If the similarity measure doesn't return values in the range [0,1]

        Examples:
            >>> gj = GeneralizedJaccard()
            >>> gj.get_sim_score(['data', 'science'], ['data'])
            0.5
            >>> gj.get_sim_score(['data', 'management'], ['data', 'data', 'science'])
            0.3333333333333333
            >>> gj.get_sim_score(['Niall'], ['Neal', 'Njall'])
            0.43333333333333335
            >>> gj = GeneralizedJaccard(sim_func=JaroWinkler().get_raw_score, threshold=0.8)
            >>> gj.get_sim_score(['Comp', 'Sci.', 'and', 'Engr', 'Dept.,', 'Universty', 'of', 'Cal,', 'San', 'Deigo'],
            ...                  ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
            0.45810185185185187

        """
        return self.get_raw_score(set1, set2)

    def get_sim_func(self):
        """
        Get similarity function

        Returns:
            similarity function (function)
        """
        return self.sim_func

    def get_threshold(self):
        """
        Get threshold used for the similarity function

        Returns:
            threshold (float)
        """
        return self.threshold

    def set_sim_func(self, sim_func):
        """
        Set similarity function

        Args:
            sim_func (function): similarity function

        Returns:
            True, always (bool)
        """
        self.sim_func = sim_func
        return True

    def set_threshold(self, threshold):
        """
        Set threshold value for the similarity function

        Args:
            threshold (float): threshold value

        Returns:
            True, always (bool)
        """
        self.threshold = threshold
        return True