# Source code for py_stringmatching.similarity_measure.soundex

"""Soundex phonetic similarity measure"""

import re

from py_stringmatching import utils
from py_stringmatching.similarity_measure.phonetic_similarity_measure import \
                                                    PhoneticSimilarityMeasure


class Soundex(PhoneticSimilarityMeasure):
    """Soundex phonetic similarity measure class.

    Two strings are considered similar (score 1) when they are identical
    after stripping non-alphanumeric characters, or when their Soundex codes
    (leading letter followed by up to three digits) are equal; otherwise the
    score is 0.
    """

    # Everything except ASCII letters and digits is discarded before encoding.
    _NON_ALNUM = re.compile('[^a-zA-Z0-9]')

    # Soundex digit for every coded consonant. Vowels, 'Y', 'H' and 'W' have
    # no entry because they never contribute a digit.
    _LETTER_CODES = {
        'B': '1', 'F': '1', 'P': '1', 'V': '1',
        'C': '2', 'G': '2', 'J': '2', 'K': '2',
        'Q': '2', 'S': '2', 'X': '2', 'Z': '2',
        'D': '3', 'T': '3',
        'L': '4',
        'M': '5', 'N': '5',
        'R': '6',
    }

    def __init__(self):
        super(Soundex, self).__init__()

    @classmethod
    def _encode(cls, word):
        """Return the (unpadded) Soundex code of ``word``.

        ``word`` must be non-empty, upper-case and consist only of ASCII
        letters and digits. The result is the first character followed by at
        most three digits; it is not padded with zeros because only equality
        of two codes matters to the caller.
        """
        digits = []
        # The leading letter is kept verbatim, but its code still takes part
        # in duplicate collapsing (e.g. 'Pfister' -> P236, 'Lloyd' -> L3).
        last_code = cls._LETTER_CODES.get(word[0])

        for char in word[1:]:
            if char in 'HW':
                # 'H' and 'W' are transparent: consonants with the same code
                # on either side are still coded once ('Ashcraft' -> A261).
                continue

            code = cls._LETTER_CODES.get(char)
            if code is None:
                # A vowel, 'Y' or a literal digit. Literal digits are kept
                # as-is so a name can be compared with a written-out code
                # such as 'R163'. Either way the run of equal codes ends, so
                # a following consonant with the same code is coded again.
                if char.isdigit():
                    digits.append(char)
                last_code = None
            elif code != last_code:
                # Adjacent letters sharing a code are coded only once.
                digits.append(code)
                last_code = code

            if len(digits) == 3:
                break

        return word[0] + ''.join(digits)

    def get_raw_score(self, string1, string2):
        """
        Computes the Soundex phonetic similarity between two strings.

        Phonetic measure such as soundex match string based on their sound. These
        measures have been especially effective in matching names, since names are
        often spelled in different ways that sound the same. For example, Meyer, Meier,
        and Mire sound the same, as do Smith, Smithe, and Smythe.

        Soundex is used primarily to match surnames. It does not work as well for names
        of East Asian origins, because much of the discriminating power of these names
        resides in the vowel sounds, which the code ignores.

        Non-alphanumeric characters are removed from both inputs before they are
        compared, and the comparison is case-insensitive. Adjacent letters that map
        to the same Soundex digit (also when separated only by 'H' or 'W') are coded
        once, so that e.g. 'Jackson' and 'Jaxon' match.

        Args:
            string1,string2 (str): Input strings

        Returns:
            Soundex similarity score (int) is returned: 1 if the strings
            match phonetically, 0 otherwise

        Raises:
            TypeError : If the inputs are not strings

        Note:
            After stripping, both strings are passed to
            ``utils.sim_check_for_zero_len``, which is expected to reject
            inputs that contain no alphanumeric characters.
            NOTE(review): confirm the exception type raised by that helper.

        Examples:
            >>> s = Soundex()
            >>> s.get_raw_score('Robert', 'Rupert')
            1
            >>> s.get_raw_score('Sue', 's')
            1
            >>> s.get_raw_score('Gough', 'Goff')
            0
            >>> s.get_raw_score('a,,li', 'ali')
            1
            >>> s.get_raw_score('Jackson', 'Jaxon')
            1

        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)

        # remove all chars but alphanumeric characters
        string1 = self._NON_ALNUM.sub('', string1)
        string2 = self._NON_ALNUM.sub('', string2)

        utils.sim_check_for_zero_len(string1, string2)

        if utils.sim_check_for_exact_match(string1, string2):
            return 1

        code1 = self._encode(string1.upper())
        code2 = self._encode(string2.upper())

        return 1 if code1 == code2 else 0

    def get_sim_score(self, string1, string2):
        """
        Computes the normalized soundex similarity between two strings.

        The raw Soundex score is already either 0 or 1, so it is returned
        unchanged.

        Args:
            string1,string2 (str): Input strings

        Returns:
            Normalized soundex similarity (int)

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> s = Soundex()
            >>> s.get_sim_score('Robert', 'Rupert')
            1
            >>> s.get_sim_score('Sue', 's')
            1
            >>> s.get_sim_score('Gough', 'Goff')
            0
            >>> s.get_sim_score('a,,li', 'ali')
            1

        """
        return self.get_raw_score(string1, string2)