Source code for py_stringmatching.similarity_measure.editex

# coding=utf-8
"""Editex distance measure"""

from __future__ import division
from __future__ import unicode_literals
import unicodedata
import six

import numpy as np

from py_stringmatching import utils
from six.moves import xrange
from six import text_type
from py_stringmatching.similarity_measure.sequence_similarity_measure import \
                                                    SequenceSimilarityMeasure


[docs]class Editex(SequenceSimilarityMeasure): """Editex distance measure class. Parameters: match_cost (int): Weight to give the correct char match, default=0 group_cost (int): Weight to give if the chars are in the same editex group, default=1 mismatch_cost (int): Weight to give the incorrect char match, default=2 local (boolean): Local variant on/off, default=False """ def __init__(self, match_cost=0, group_cost=1, mismatch_cost=2, local=False): self.match_cost = match_cost self.group_cost = group_cost self.mismatch_cost = mismatch_cost self.local = local super(Editex, self).__init__()
[docs] def get_raw_score(self, string1, string2): """ Computes the editex distance between two strings. As described on pages 3 & 4 of Zobel, Justin and Philip Dart. 1996. Phonetic string matching: Lessons from information retrieval. In: Proceedings of the ACM-SIGIR Conference on Research and Development in Information Retrieval, Zurich, Switzerland. 166–173. http://goanna.cs.rmit.edu.au/~jz/fulltext/sigir96.pdf The local variant is based on Ring, Nicholas and Alexandra L. Uitdenbogerd. 2009. Finding ‘Lucy in Disguise’: The Misheard Lyric Matching Problem. In: Proceedings of the 5th Asia Information Retrieval Symposium, Sapporo, Japan. 157-167. http://www.seg.rmit.edu.au/research/download.php?manuscript=404 Args: string1,string2 (str): Input strings Returns: Editex distance (int) Raises: TypeError : If the inputs are not strings Examples: >>> ed = Editex() >>> ed.get_raw_score('cat', 'hat') 2 >>> ed.get_raw_score('Niall', 'Neil') 2 >>> ed.get_raw_score('aluminum', 'Catalan') 12 >>> ed.get_raw_score('ATCG', 'TAGC') 6 References: * Abydos Library - https://github.com/chrislit/abydos/blob/master/abydos/distance.py """ # input validations utils.sim_check_for_none(string1, string2) utils.sim_check_for_string_inputs(string1, string2) if utils.sim_check_for_exact_match(string1, string2): return 0 # convert both the strings to NFKD normalized unicode string1 = unicodedata.normalize('NFKD', text_type(string1.upper())) string2 = unicodedata.normalize('NFKD', text_type(string2.upper())) # convert ß to SS (for Python2) string1 = string1.replace('ß', 'SS') string2 = string2.replace('ß', 'SS') if len(string1) == 0: return len(string2) * self.mismatch_cost if len(string2) == 0: return len(string1) * self.mismatch_cost d_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.int) len1 = len(string1) len2 = len(string2) string1 = ' ' + string1 string2 = ' ' + string2 editex_helper = EditexHelper(self.match_cost, self.mismatch_cost, self.group_cost) if not self.local: for i in xrange(1, len1 + 1): d_mat[i, 0] = d_mat[i - 1, 0] + editex_helper.d_cost( string1[i - 1], string1[i]) for j in xrange(1, len2 + 1): d_mat[0, j] = d_mat[0, j - 1] + editex_helper.d_cost(string2[j - 1], string2[j]) for i in xrange(1, len1 + 1): for j in xrange(1, len2 + 1): d_mat[i, j] = min(d_mat[i - 1, j] + editex_helper.d_cost( string1[i - 1], string1[i]), d_mat[i, j - 1] + editex_helper.d_cost( string2[j - 1], string2[j]), d_mat[i - 1, j - 1] + editex_helper.r_cost( string1[i], string2[j])) return d_mat[len1, len2]
[docs] def get_sim_score(self, string1, string2): """ Computes the normalized editex similarity between two strings. Args: string1,string2 (str): Input strings Returns: Normalized editex similarity (float) Raises: TypeError : If the inputs are not strings Examples: >>> ed = Editex() >>> ed.get_sim_score('cat', 'hat') 0.66666666666666674 >>> ed.get_sim_score('Niall', 'Neil') 0.80000000000000004 >>> ed.get_sim_score('aluminum', 'Catalan') 0.25 >>> ed.get_sim_score('ATCG', 'TAGC') 0.25 References: * Abydos Library - https://github.com/chrislit/abydos/blob/master/abydos/distance.py """ raw_score = self.get_raw_score(string1, string2) string1_len = len(string1) string2_len = len(string2) if string1_len == 0 and string2_len == 0: return 1.0 return 1 - (raw_score / max(string1_len * self.mismatch_cost, string2_len * self.mismatch_cost))
[docs] def get_match_cost(self): """ Get match cost Returns: match cost (int) """ return self.match_cost
[docs] def get_group_cost(self): """ Get group cost Returns: group cost (int) """ return self.group_cost
[docs] def get_mismatch_cost(self): """ Get mismatch cost Returns: mismatch cost (int) """ return self.mismatch_cost
[docs] def get_local(self): """ Get local flag Returns: local flag (boolean) """ return self.local
[docs] def set_match_cost(self, match_cost): """ Set match cost Args: match_cost (int): Weight to give the correct char match """ self.match_cost = match_cost return True
[docs] def set_group_cost(self, group_cost): """ Set group cost Args: group_cost (int): Weight to give if the chars are in the same editex group """ self.group_cost = group_cost return True
[docs] def set_mismatch_cost(self, mismatch_cost): """ Set mismatch cost Args: mismatch_cost (int): Weight to give the incorrect char match """ self.mismatch_cost = mismatch_cost return True
[docs] def set_local(self, local): """ Set local flag Args: local (boolean): Local variant on/off """ self.local = local return True
class EditexHelper: letter_groups = dict() letter_groups['A'] = letter_groups['E'] = letter_groups['I'] = letter_groups['O'] \ = letter_groups['U'] = letter_groups['Y'] = 0 letter_groups['B'] = letter_groups['P'] = 1 letter_groups['C'] = letter_groups['K'] = letter_groups['Q'] = 2 letter_groups['D'] = letter_groups['T'] = 3 letter_groups['L'] = letter_groups['R'] = 4 letter_groups['M'] = letter_groups['N'] = 5 letter_groups['G'] = letter_groups['J'] = 6 letter_groups['F'] = letter_groups['P'] = letter_groups['V'] = 7 letter_groups['S'] = letter_groups['X'] = letter_groups['Z'] = 8 letter_groups['C'] = letter_groups['S'] = letter_groups['J'] = 9 all_letters = frozenset('AEIOUYBPCKQDTLRMNGJFVSXZ') def __init__(self, match_cost, mismatch_cost, group_cost): self.match_cost = match_cost self.mismatch_cost = mismatch_cost self.group_cost = group_cost def r_cost(self, ch1, ch2): """Return r(a,b) according to Zobel & Dart's definition """ if ch1 == ch2: return self.match_cost if ch1 in EditexHelper.all_letters and ch2 in EditexHelper.all_letters: if (EditexHelper.letter_groups[ch1] == EditexHelper.letter_groups[ch2]): return self.group_cost return self.mismatch_cost def d_cost(self, ch1, ch2): """Return d(a,b) according to Zobel & Dart's definition """ if ch1 != ch2 and (ch1 == 'H' or ch1 == 'W'): return self.group_cost return self.r_cost(ch1, ch2)