Source code for deepmatcher.models.word_contextualizers

import math

import six

import deepmatcher as dm
import torch
import torch.nn as nn
from torch.autograd import Variable

from . import _utils
from ..batch import AttrTensor


class RNN(dm.modules.RNN, dm.WordContextualizer):
    """Multi layered RNN based Word Contextualizer.

    Supports dropout and residual / highway connections. Takes the same parameters as
    the :class:`~deepmatcher.modules.RNN` module.
    """
    pass
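
# Usage sketch (illustrative, not part of the original module): a minimal example of
# plugging the RNN word contextualizer into a matching model. It assumes the public
# deepmatcher API (dm.MatchingModel, dm.attr_summarizers.Hybrid and the
# `word_contextualizer` argument of attribute summarizers); the helper function name
# below is hypothetical.
def _example_rnn_word_contextualizer():  # hypothetical, for illustration only
    # Build a model whose attribute summarizer contextualizes word embeddings with
    # this multi-layered RNN before word comparison and aggregation.
    return dm.MatchingModel(
        attr_summarizer=dm.attr_summarizers.Hybrid(
            word_contextualizer=dm.word_contextualizers.RNN()))
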
# class CNN(dm.WordContextualizer):
#     pass
class SelfAttention(dm.WordContextualizer):
    """__init__(heads=1, hidden_size=None, input_dropout=0, alignment_network='decomposable', scale=False, score_dropout=0, value_transform_network=None, value_merge='concat', transform_dropout=0, output_transform_network=None, output_dropout=0, bypass_network='highway', input_size=None)

    Self Attention based Word Contextualizer.

    Supports `vanilla self attention <https://arxiv.org/abs/1606.01933>`__ and
    `multi-head self attention <https://arxiv.org/abs/1706.03762>`__.

    Args:
        heads (int):
            Number of attention heads to use. Defaults to 1.
        hidden_size (int):
            The default hidden size of the `alignment_network` and transform networks,
            if they are not disabled.
        input_dropout (float):
            If non-zero, applies dropout to the input to this module. Dropout
            probability must be between 0 and 1.
        alignment_network (string or :class:`deepmatcher.modules.AlignmentNetwork` or callable):
            The neural network that takes the input sequence, aligns each word in the
            sequence with the other words in the sequence, and returns the corresponding
            alignment score matrix. Argument must specify a :ref:`align-op` operation.
        scale (bool):
            Whether to scale the alignment scores by the square root of the
            `hidden_size` parameter. Based on `scaled dot-product attention
            <https://arxiv.org/abs/1706.03762>`__.
        score_dropout (float):
            If non-zero, applies dropout to the alignment score matrix. Dropout
            probability must be between 0 and 1.
        value_transform_network (string or :class:`~deepmatcher.modules.Transform` or callable):
            For each word embedding in the input sequence, SelfAttention takes a
            weighted average of the aligning values, i.e., the aligning word embeddings,
            weighted by the alignment scores. This parameter specifies the neural
            network to transform the values (word embeddings) before taking the weighted
            average. Argument must be None or specify a :ref:`transform-op` operation.
            If the argument is a string, the hidden size of the transform operation is
            computed as :code:`hidden_size // heads`. If the argument is None and
            `heads` is 1, the values are not transformed. If the argument is None and
            `heads` is > 1, a 1 layer highway network without any non-linearity is used.
            The hidden size for this is computed as mentioned above.
        value_merge (string or :class:`~deepmatcher.modules.Merge` or callable):
            For each word embedding in the input sequence, each SelfAttention head
            produces one corresponding vector as output. This parameter specifies how to
            merge the outputs of all attention heads for each word embedding.
            Concatenates the outputs of all heads by default. Argument must specify a
            :ref:`merge-op` operation.
        transform_dropout (float):
            If non-zero, applies dropout to the output of the
            `value_transform_network`, if applicable. Dropout probability must be
            between 0 and 1.
        output_transform_network (string or :class:`~deepmatcher.modules.Transform` or callable):
            For each word embedding in the input sequence, SelfAttention produces one
            corresponding vector as output. This neural network specifies how to
            transform each of these output vectors to obtain a hidden representation of
            size `hidden_size`. Argument must be None or specify a :ref:`transform-op`
            operation. If the argument is None and `heads` is 1, the output vectors are
            not transformed. If the argument is None and `heads` is > 1, a 1 layer
            highway network without any non-linearity is used.
        output_dropout (float):
            If non-zero, applies dropout to the output of the
            `output_transform_network`, if applicable. Dropout probability must be
            between 0 and 1.
        bypass_network (string or :class:`Bypass` or callable):
            The bypass network (e.g. residual or highway network) to use. The input word
            embedding sequence to this module is considered the raw input to the bypass
            network, and the final output vector sequence (the output of `value_merge`,
            or of `output_transform_network` if applicable) is considered the
            transformed input. Argument must specify a :ref:`bypass-op` operation. If
            None, does not use a bypass network.
        input_size (int):
            The number of features in the input to the module. This parameter will be
            automatically specified by :class:`LazyModule`.
    """

    def _init(self,
              heads=1,
              hidden_size=None,
              input_dropout=0,
              alignment_network='decomposable',
              scale=False,
              score_dropout=0,
              value_transform_network=None,
              value_merge='concat',
              transform_dropout=0,
              output_transform_network=None,
              output_dropout=0,
              bypass_network='highway',
              input_size=None):
        hidden_size = hidden_size if hidden_size is not None else input_size

        # One alignment network per attention head.
        self.alignment_networks = nn.ModuleList()
        for head in range(heads):
            self.alignment_networks.append(
                dm.modules._alignment_module(alignment_network, hidden_size))

        if value_transform_network is None and heads > 1:
            value_transform_network = dm.modules.Transform(
                '1-layer-highway', non_linearity=None,
                hidden_size=hidden_size // heads)
        self.value_transform_network = dm.modules._transform_module(
            value_transform_network, hidden_size // heads)

        self.value_merge = dm.modules._merge_module(value_merge)

        self.softmax = nn.Softmax(dim=2)

        if output_transform_network is None and heads > 1:
            output_transform_network = dm.modules.Transform(
                '1-layer-highway', non_linearity=None, hidden_size=hidden_size)
        self.output_transform_network = dm.modules._transform_module(
            output_transform_network, hidden_size)

        self.input_dropout = nn.Dropout(input_dropout)
        self.transform_dropout = nn.Dropout(transform_dropout)
        self.score_dropout = nn.Dropout(score_dropout)
        self.output_dropout = nn.Dropout(output_dropout)

        self.bypass_network = dm.modules._bypass_module(bypass_network)

        self.heads = heads
        self.scale = scale
        self.hidden_size = hidden_size

    def _forward(self, input_with_meta):
        input = self.input_dropout(input_with_meta.data)

        values_aligned = []
        for head in range(self.heads):
            # Alignment scores between every pair of words. Dims: batch x len1 x len2.
            alignment_scores = self.score_dropout(
                self.alignment_networks[head](input, input))
            if self.scale:
                alignment_scores = alignment_scores / math.sqrt(self.hidden_size)

            # Mask out padding positions so they receive zero attention weight.
            if input_with_meta.lengths is not None:
                mask = _utils.sequence_mask(input_with_meta.lengths)
                mask = mask.unsqueeze(1)  # Make it broadcastable.
                alignment_scores.data.masked_fill_(1 - mask, -float('inf'))

            # Normalize scores over the aligned (second) sequence dimension.
            normalized_scores = self.softmax(alignment_scores)

            if self.value_transform_network is not None:
                values_transformed = self.transform_dropout(
                    self.value_transform_network(input))
            else:
                values_transformed = input

            # Weighted average of the (transformed) values. Dims: batch x len1 x channels.
            values_aligned.append(torch.bmm(normalized_scores, values_transformed))

        # Merge the per-head outputs (concatenation by default).
        values_merged = self.value_merge(*values_aligned)

        output = values_merged
        if self.output_transform_network:
            output = self.output_transform_network(output)
        output = self.output_dropout(output)

        # Combine module input and output via the bypass (residual / highway) network.
        final_output = self.bypass_network(output, input)

        return AttrTensor.from_old_metadata(final_output, input_with_meta)
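
# Usage sketch (illustrative, not part of the original module): configuring multi-head
# self attention as the word contextualizer. With value_merge='concat' (the default),
# each of the `heads` attention heads produces vectors of size hidden_size // heads,
# which are concatenated back to hidden_size before the bypass network. The helper
# function name is hypothetical and the keyword values are illustrative.
def _example_self_attention_word_contextualizer():  # hypothetical, for illustration only
    self_attention = dm.word_contextualizers.SelfAttention(
        heads=4,            # four attention heads
        hidden_size=300,    # per-word output size after merging heads
        scale=True)         # scale scores by sqrt(hidden_size), as in scaled dot-product attention
    return dm.MatchingModel(
        attr_summarizer=dm.attr_summarizers.Hybrid(
            word_contextualizer=self_attention))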