Source code for py_stringmatching.similarity_measure.soft_tfidf

from __future__ import division
from math import sqrt
import collections

from py_stringmatching import utils
from py_stringmatching.similarity_measure.jaro import Jaro
from py_stringmatching.similarity_measure.hybrid_similarity_measure import \
                                                    HybridSimilarityMeasure


[docs]class SoftTfIdf(HybridSimilarityMeasure):
    """Computes soft TF/IDF measure. 
    
    Note:
        Currently, this measure is implemented without dampening. This is similar to setting dampen flag to be False in TF-IDF.
        We plan to add the dampen flag in the next release.   

    Args:
        corpus_list (list of lists): Corpus list (default is set to None) of strings. If set to None,
                                     the input list are considered the only corpus.
        sim_func (function): Secondary similarity function. This should return a similarity score between two strings (optional),
                             default is the Jaro similarity measure.
        threshold (float): Threshold value for the secondary similarity function (defaults to 0.5). If the similarity
                           of a token pair exceeds the threshold, then the token pair is considered a match.

    Attributes:
        sim_func (function): An attribute to store the secondary similarity function.
        threshold (float): An attribute to store the threshold value for the secondary similarity function. 
    """

    def __init__(self, corpus_list=None, sim_func=Jaro().get_raw_score,
                 threshold=0.5):
        self.__corpus_list = corpus_list
        self.__document_frequency = {}
        self.__compute_document_frequency()
        self.__corpus_size = 0 if self.__corpus_list is None else (
                                                         len(self.__corpus_list))
        self.sim_func = sim_func
        self.threshold = threshold
        super(SoftTfIdf, self).__init__()

[docs]    def get_raw_score(self, bag1, bag2):
        """Computes the raw soft TF/IDF score between two lists given the corpus information.

        Args:
            bag1,bag2 (list): Input lists

        Returns:
            Soft TF/IDF score between the input lists (float).

        Raises:
            TypeError : If the inputs are not lists or if one of the inputs is None.

        Examples:
            >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']], sim_func=Jaro().get_raw_score, threshold=0.8)
            >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'c'])
            0.17541160386140586
            >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']], threshold=0.9)
            >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
            0.5547001962252291
            >>> soft_tfidf = SoftTfIdf([['x', 'y'], ['w'], ['q']])
            >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
            0.0
            >>> soft_tfidf = SoftTfIdf(sim_func=Affine().get_raw_score, threshold=0.6)
            >>> soft_tfidf.get_raw_score(['aa', 'bb', 'a'], ['ab', 'ba'])
            0.81649658092772592

        References:
            * the string matching chapter of the "Principles of Data Integration" book.
        """
        
        # input validations
        utils.sim_check_for_none(bag1, bag2)
        utils.sim_check_for_list_or_set_inputs(bag1, bag2)

        # if the strings match exactly return 1.0
        if utils.sim_check_for_exact_match(bag1, bag2):
            return 1.0

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(bag1, bag2):
            return 0

        # term frequency for input strings
        tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)
        
        # find unique elements in the input lists and their document frequency 
        local_df = {}
        for element in tf_x:
            local_df[element] = local_df.get(element, 0) + 1
        for element in tf_y:
            local_df[element] = local_df.get(element, 0) + 1

        # if corpus is not provided treat input string as corpus
        curr_df, corpus_size = (local_df, 2) if self.__corpus_list is None else (
                                   (self.__document_frequency, self.__corpus_size))

        # calculating the term sim score against the input string 2,
        # construct similarity map
        similarity_map = {}
        for term_x in tf_x:
            max_score = 0.0
            for term_y in tf_y:
                score = self.sim_func(term_x, term_y)
                # adding sim only if it is above threshold and
                # highest for this element
                if score > self.threshold and score > max_score:
                    similarity_map[term_x] = (term_x, term_y, score)
                    max_score = score

        # position of first string, second string and sim score
        # in the tuple
        first_string_pos = 0
        second_string_pos = 1
        sim_score_pos = 2

        result, v_x_2, v_y_2 = 0.0, 0.0, 0.0
        # soft-tfidf calculation
        for element in local_df.keys():
            if curr_df.get(element) is None:
                continue
            # numerator
            if element in similarity_map:
                sim = similarity_map[element]
                idf_first = corpus_size / curr_df.get(sim[first_string_pos], 1)
                idf_second = corpus_size / curr_df.get(sim[second_string_pos], 1)
                v_x = idf_first * tf_x.get(sim[first_string_pos], 0)
                v_y = idf_second * tf_y.get(sim[second_string_pos], 0)
                result += v_x * v_y * sim[sim_score_pos]
            # denominator
            idf = corpus_size / curr_df[element]
            v_x = idf * tf_x.get(element, 0)
            v_x_2 += v_x * v_x
            v_y = idf * tf_y.get(element, 0)
            v_y_2 += v_y * v_y
        return result if v_x_2 == 0 else result / (sqrt(v_x_2) * sqrt(v_y_2))

[docs]    def get_corpus_list(self):
        """Get corpus list.

        Returns:
            corpus list (list of lists).
        """
        return self.__corpus_list

[docs]    def get_sim_func(self):
        """Get secondary similarity function.

        Returns:
            secondary similarity function (function).
        """
        return self.sim_func

[docs]    def get_threshold(self):
        """Get threshold used for the secondary similarity function.

        Returns:
            threshold (float).
        """
        return self.threshold

[docs]    def set_threshold(self, threshold):
        """Set threshold value for the secondary similarity function.

        Args:
            threshold (float): threshold value.
        """
        self.threshold = threshold
        return True

[docs]    def set_sim_func(self, sim_func):
        """Set secondary similarity function.

        Args:
            sim_func (function): Secondary similarity function.
        """
        self.sim_func = sim_func
        return True

[docs]    def set_corpus_list(self, corpus_list):
        """Set corpus list.

        Args:
            corpus_list (list of lists): Corpus list.
        """
        self.__corpus_list = corpus_list
        self.__document_frequency = {}
        self.__compute_document_frequency()
        self.__corpus_size = 0 if self.__corpus_list is None else (
                                                         len(self.__corpus_list))
        return True

    def __compute_document_frequency(self):
        if self.__corpus_list != None:
            for document in self.__corpus_list:
                for element in set(document):
                    self.__document_frequency[element] = (
                        self.__document_frequency.get(element, 0) + 1)