Source code for py_stringmatching.similarity_measure.levenshtein

from __future__ import division

from py_stringmatching import utils
from py_stringmatching.similarity_measure.cython.cython_levenshtein import levenshtein
from py_stringmatching.similarity_measure.sequence_similarity_measure import \
    SequenceSimilarityMeasure


class Levenshtein(SequenceSimilarityMeasure):
    """Computes Levenshtein measure (also known as edit distance).

    Levenshtein distance computes the minimum cost of transforming one string
    into the other. Transforming a string is carried out using a sequence of
    the following operators: delete a character, insert a character, and
    substitute one character for another.
    """

    def __init__(self):
        super(Levenshtein, self).__init__()

    def get_raw_score(self, string1, string2):
        """Computes the raw Levenshtein distance between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Levenshtein distance (int).

        Raises:
            TypeError : If the inputs are not strings.

        Examples:
            >>> lev = Levenshtein()
            >>> lev.get_raw_score('a', '')
            1
            >>> lev.get_raw_score('example', 'samples')
            3
            >>> lev.get_raw_score('levenshtein', 'frankenstein')
            6

        """
        # Input validations.
        utils.sim_check_for_none(string1, string2)

        # Convert input to unicode before the string-type check so that the
        # check and the distance computation see the converted values.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # Shortcut that skips the distance computation when the helper
        # reports an exact match. Return an integer zero so the return type
        # matches the documented int distance.
        if utils.sim_check_for_exact_match(string1, string2):
            return 0

        return levenshtein(string1, string2)

    def get_sim_score(self, string1, string2):
        """Computes the normalized Levenshtein similarity score between two strings.

        The raw distance is divided by the length of the longer input and
        subtracted from 1. If both inputs have length zero, the score is 1.0.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Normalized Levenshtein similarity (float).

        Raises:
            TypeError : If the inputs are not strings.

        Examples:
            >>> lev = Levenshtein()
            >>> lev.get_sim_score('a', '')
            0.0
            >>> lev.get_sim_score('example', 'samples')
            0.5714285714285714
            >>> lev.get_sim_score('levenshtein', 'frankenstein')
            0.5

        """
        # Convert input strings to unicode up front so the lengths used for
        # normalization below are measured on the converted values.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        # get_raw_score also performs the input validation.
        raw_score = self.get_raw_score(string1, string2)

        max_len = max(len(string1), len(string2))
        # Two zero-length inputs: avoid dividing by zero and report a
        # perfect similarity.
        if max_len == 0:
            return 1.0

        # Relies on true division (the file imports it from __future__).
        return 1 - (raw_score / max_len)