Source code for py_stringmatching.similarity_measure.affine

import numpy as np

from py_stringmatching import utils
from six.moves import xrange
from py_stringmatching.similarity_measure.sequence_similarity_measure import \
                                                    SequenceSimilarityMeasure


def sim_ident(char1, char2):
    return int(char1 == char2)


[docs]class Affine(SequenceSimilarityMeasure): """Returns the affine gap score between two strings. The affine gap measure is an extension of the Needleman-Wunsch measure that handles the longer gaps more gracefully. For more information refer to the string matching chapter in the DI book ("Principles of Data Integration"). Args: gap_start (float): Cost for the gap at the start (defaults to 1). gap_continuation (float): Cost for the gap continuation (defaults to 0.5). sim_func (function): Function computing similarity score between two characters, which are represented as strings (defaults to an identity function, which returns 1 if the two characters are the same and returns 0 otherwise). Attributes: gap_start (float): An attribute to store the gap cost at the start. gap_continuation (float): An attribute to store the gap continuation cost. sim_func (function): An attribute to store the similarity function. """ def __init__(self, gap_start=1, gap_continuation=0.5, sim_func=sim_ident): self.gap_start = gap_start self.gap_continuation = gap_continuation self.sim_func = sim_func super(Affine, self).__init__()
[docs] def get_raw_score(self, string1, string2): """Computes the affine gap score between two strings. This score can be outside the range [0,1]. Args: string1,string2 (str) : Input strings. Returns: Affine gap score betwen the two input strings (float). Raises: TypeError : If the inputs are not strings or if one of the inputs is None. Examples: >>> aff = Affine() >>> aff.get_raw_score('dva', 'deeva') 1.5 >>> aff = Affine(gap_start=2, gap_continuation=0.5) >>> aff.get_raw_score('dva', 'deeve') -0.5 >>> aff = Affine(gap_continuation=0.2, sim_func=lambda s1, s2: (int(1 if s1 == s2 else 0))) >>> aff.get_raw_score('AAAGAATTCA', 'AAATCA') 4.4 """ # input validations utils.sim_check_for_none(string1, string2) # convert input to unicode. string1 = utils.convert_to_unicode(string1) string2 = utils.convert_to_unicode(string2) utils.tok_check_for_string_input(string1, string2) # if one of the strings is empty return 0 if utils.sim_check_for_empty(string1, string2): return 0 gap_start = -self.gap_start gap_continuation = -self.gap_continuation m = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float) x = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float) y = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float) # DP initialization for i in xrange(1, len(string1) + 1): m[i][0] = -float("inf") x[i][0] = gap_start + (i - 1) * gap_continuation y[i][0] = -float("inf") # DP initialization for j in xrange(1, len(string2) + 1): m[0][j] = -float("inf") x[0][j] = -float("inf") y[0][j] = gap_start + (j - 1) * gap_continuation # affine gap calculation using DP for i in xrange(1, len(string1) + 1): for j in xrange(1, len(string2) + 1): # best score between x_1....x_i and y_1....y_j # given that x_i is aligned to y_j m[i][j] = (self.sim_func(string1[i - 1], string2[j - 1]) + max(m[i - 1][j - 1], x[i - 1][j - 1], y[i - 1][j - 1])) # the best score given that x_i is aligned to a gap x[i][j] = max(gap_start + m[i - 1][j], gap_continuation + x[i - 1][j]) # the best score given that y_j is aligned to a gap y[i][j] = max(gap_start + m[i][j - 1], gap_continuation + y[i][j - 1]) return max(m[len(string1)][len(string2)], x[len(string1)][len(string2)], y[len(string1)][len(string2)])
[docs] def get_gap_start(self): """Get gap start cost. Returns: gap start cost (float). """ return self.gap_start
[docs] def get_gap_continuation(self): """Get gap continuation cost. Returns: gap continuation cost (float). """ return self.gap_continuation
[docs] def get_sim_func(self): """Get similarity function. Returns: similarity function (function). """ return self.sim_func
[docs] def set_gap_start(self, gap_start): """Set gap start cost. Args: gap_start (float): Cost for the gap at the start. """ self.gap_start = gap_start return True
[docs] def set_gap_continuation(self, gap_continuation): """Set gap continuation cost. Args: gap_continuation (float): Cost for the gap continuation. """ self.gap_continuation = gap_continuation return True
[docs] def set_sim_func(self, sim_func): """Set similarity function. Args: sim_func (function): Function computing similarity score between two characters, represented as strings. """ self.sim_func = sim_func return True