Source code for py_stringmatching.similarity_measure.smith_waterman

# coding=utf-8

import numpy as np

from py_stringmatching import utils
from six.moves import xrange
from py_stringmatching.similarity_measure.sequence_similarity_measure import \
                                                    SequenceSimilarityMeasure


def sim_ident(char1, char2):
    return int(char1 == char2)


[docs]class SmithWaterman(SequenceSimilarityMeasure): """Computes Smith-Waterman measure. The Smith-Waterman algorithm performs local sequence alignment; that is, for determining similar regions between two strings. Instead of looking at the total sequence, the Smith–Waterman algorithm compares segments of all possible lengths and optimizes the similarity measure. See the string matching chapter in the DI book (Principles of Data Integration). Args: gap_cost (float): Cost of gap (defaults to 1.0). sim_func (function): Similarity function to give a score for the correspondence between the characters (defaults to an identity function, which returns 1 if the two characters are the same and 0 otherwise). Attributes: gap_cost (float): An attribute to store the gap cost. sim_func (function): An attribute to store the similarity function. """ def __init__(self, gap_cost=1.0, sim_func=sim_ident): self.gap_cost = gap_cost self.sim_func = sim_func super(SmithWaterman, self).__init__()
[docs] def get_raw_score(self, string1, string2): """Computes the raw Smith-Waterman score between two strings. Args: string1,string2 (str) : Input strings. Returns: Smith-Waterman similarity score (float). Raises: TypeError : If the inputs are not strings or if one of the inputs is None. Examples: >>> sw = SmithWaterman() >>> sw.get_raw_score('cat', 'hat') 2.0 >>> sw = SmithWaterman(gap_cost=2.2) >>> sw.get_raw_score('dva', 'deeve') 1.0 >>> sw = SmithWaterman(gap_cost=1, sim_func=lambda s1, s2 : (2 if s1 == s2 else -1)) >>> sw.get_raw_score('dva', 'deeve') 2.0 >>> sw = SmithWaterman(gap_cost=1.4, sim_func=lambda s1, s2 : (1.5 if s1 == s2 else 0.5)) >>> sw.get_raw_score('GCATAGCU', 'GATTACA') 6.5 """ # input validations utils.sim_check_for_none(string1, string2) # convert input to unicode. string1 = utils.convert_to_unicode(string1) string2 = utils.convert_to_unicode(string2) utils.tok_check_for_string_input(string1, string2) dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float) max_value = 0 # Smith Waterman DP calculations for i in xrange(1, len(string1) + 1): for j in xrange(1, len(string2) + 1): match = dist_mat[i - 1, j - 1] + self.sim_func(string1[i - 1], string2[j - 1]) delete = dist_mat[i - 1, j] - self.gap_cost insert = dist_mat[i, j - 1] - self.gap_cost dist_mat[i, j] = max(0, match, delete, insert) max_value = max(max_value, dist_mat[i, j]) return max_value
[docs] def get_gap_cost(self): """Get gap cost. Returns: Gap cost (float). """ return self.gap_cost
[docs] def get_sim_func(self): """Get similarity function. Returns: Similarity function (function). """ return self.sim_func
[docs] def set_gap_cost(self, gap_cost): """Set gap cost. Args: gap_cost (float): Cost of gap. """ self.gap_cost = gap_cost return True
[docs] def set_sim_func(self, sim_func): """Set similarity function. Args: sim_func (function): Similarity function to give a score for the correspondence between the characters. """ self.sim_func = sim_func return True