Source code for py_stringmatching.similarity_measure.soundex

"""Soundex phonetic similarity measure"""

import re

from py_stringmatching import utils
from py_stringmatching.similarity_measure.phonetic_similarity_measure import \
                                                    PhoneticSimilarityMeasure


class Soundex(PhoneticSimilarityMeasure):
    """Soundex phonetic similarity measure class.

    Two strings are treated as similar when they produce the same Soundex
    code: the first character followed by up to three digits describing the
    consonant sounds that follow it.
    """

    # Everything except ASCII letters and digits is stripped before encoding.
    _NON_ALNUM = re.compile("[^a-zA-Z0-9]")

    # Consonant -> Soundex digit:
    # (B,F,P,V)->1 (C,G,J,K,Q,S,X,Z)->2 (D,T)->3 (L)->4 (M,N)->5 (R)->6
    # Letters missing from the table (vowels, 'Y', 'H', 'W') have no code.
    _CODES = {
        'B': '1', 'F': '1', 'P': '1', 'V': '1',
        'C': '2', 'G': '2', 'J': '2', 'K': '2',
        'Q': '2', 'S': '2', 'X': '2', 'Z': '2',
        'D': '3', 'T': '3',
        'L': '4',
        'M': '5', 'N': '5',
        'R': '6',
    }

    def __init__(self):
        super(Soundex, self).__init__()

    def _soundex_code(self, string):
        """Return the Soundex code of a cleaned, non-empty string.

        Args:
            string (str): Input containing only ASCII letters and digits.

        Returns:
            The first character (upper-cased) followed by at most three
            code symbols (str). The code is not zero-padded: padding cannot
            change whether two codes are equal for letter-only input, and
            leaving it out keeps a literal '0' in the input distinguishable.
        """
        word = string.upper()
        symbols = [word[0]]
        # The first letter's own code takes part in duplicate suppression,
        # e.g. the 'F' in 'Pfister' is dropped because 'P' is also a 1.
        previous = self._CODES.get(word[0], '')

        for char in word[1:]:
            if len(symbols) == 4:
                break
            if char in 'HW':
                # 'H' and 'W' are transparent: equal codes on either side
                # of them still collapse into one.
                continue
            if char.isdigit():
                # Digits have no phonetic code; they are kept literally, as
                # the earlier implementation did, and are never merged.
                symbols.append(char)
                previous = ''
                continue
            code = self._CODES.get(char, '')
            if code and code != previous:
                symbols.append(code)
            # A vowel (or 'Y') resets `previous` to '', so an equal code
            # following it is emitted again.
            previous = code

        return ''.join(symbols)

    def get_raw_score(self, string1, string2):
        """
        Computes the Soundex phonetic similarity between two strings.

        Phonetic measures such as Soundex match strings based on their
        sound. These measures have been especially effective in matching
        names, since names are often spelled in different ways that sound
        the same. For example, Meyer, Meier, and Mire sound the same, as do
        Smith, Smithe, and Smythe.

        Soundex is used primarily to match surnames. It does not work as
        well for names of East Asian origins, because much of the
        discriminating power of these names resides in the vowel sounds,
        which the code ignores.

        Both inputs are stripped of every character that is not an ASCII
        letter or digit. If the stripped strings are identical the score is
        1; otherwise each is converted to its Soundex code and the codes are
        compared. Adjacent letters sharing a code are encoded once, also
        when separated by 'H' or 'W'; a vowel between them keeps both.

        Args:
            string1,string2 (str): Input strings

        Returns:
            Soundex similarity score (int) is returned: 1 if the codes
            match, 0 otherwise

        Raises:
            TypeError : If the inputs are not strings

        Examples:
            >>> s = Soundex()
            >>> s.get_raw_score('Robert', 'Rupert')
            1
            >>> s.get_raw_score('Sue', 's')
            1
            >>> s.get_raw_score('Gough', 'Goff')
            0
            >>> s.get_raw_score('a,,li', 'ali')
            1
            >>> s.get_raw_score('Jackson', 'Jaxon')
            1

        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)

        # remove all chars but alphanumeric characters
        string1 = self._NON_ALNUM.sub("", string1)
        string2 = self._NON_ALNUM.sub("", string2)

        # NOTE(review): presumably rejects inputs that became empty after
        # cleaning -- confirm against utils.sim_check_for_zero_len.
        utils.sim_check_for_zero_len(string1, string2)

        # Identical cleaned strings match without being encoded.
        if utils.sim_check_for_exact_match(string1, string2):
            return 1

        code1 = self._soundex_code(string1)
        code2 = self._soundex_code(string2)
        return 1 if code1 == code2 else 0

    def get_sim_score(self, string1, string2):
        """
        Computes the normalized soundex similarity between two strings.

        The raw score is already either 0 or 1, so it is returned unchanged.

        Args:
            string1,string2 (str): Input strings

        Returns:
            Normalized soundex similarity (int)

        Raises:
            TypeError : If the inputs are not strings or if one of the
                inputs is None.

        Examples:
            >>> s = Soundex()
            >>> s.get_sim_score('Robert', 'Rupert')
            1
            >>> s.get_sim_score('Sue', 's')
            1
            >>> s.get_sim_score('Gough', 'Goff')
            0
            >>> s.get_sim_score('a,,li', 'ali')
            1

        """
        return self.get_raw_score(string1, string2)