import six
import deepmatcher as dm
import torch
import torch.nn as nn
from torch.autograd import Variable
from . import _utils
from ..batch import AttrTensor
class RNN(dm.modules.RNN, dm.WordContextualizer):
    """Multi layered RNN based Word Contextualizer.

    Supports dropout and residual / highway connections. Takes the same parameters
    as the :class:`~deepmatcher.modules.RNN` module.

    This class only combines the two bases (mixin-style); all behavior comes from
    :class:`~deepmatcher.modules.RNN`.
    """
    pass
# class CNN(dm.WordContextualizer):
# pass
class SelfAttention(dm.WordContextualizer):
    """__init__(heads=1, hidden_size=None, input_dropout=0, alignment_network='decomposable', scale=False, score_dropout=0, value_transform_network=None, value_merge='concat', transform_dropout=0, output_transform_network=None, output_dropout=0, bypass_network='highway', input_size=None)

    Self Attention based Word Contextualizer.

    Supports `vanilla self attention <https://arxiv.org/abs/1606.01933>`__ and
    `multi-head self attention <https://arxiv.org/abs/1706.03762>`__.

    Args:
        heads (int):
            Number of attention heads to use. Defaults to 1.
        hidden_size (int):
            The default hidden size of the `alignment_network` and transform
            networks, if they are not disabled.
        input_dropout (float):
            If non-zero, applies dropout to the input to this module. Dropout
            probability must be between 0 and 1.
        alignment_network (string or :class:`deepmatcher.modules.AlignmentNetwork` or callable):
            The neural network takes the input sequence, aligns the words in the
            sequence with other words in the sequence, and returns the
            corresponding alignment score matrix. Argument must specify a
            :ref:`align-op` operation.
        scale (bool):
            Whether to scale the alignment scores by the square root of the
            `hidden_size` parameter. Based on `scaled dot-product attention
            <https://arxiv.org/abs/1706.03762>`__
        score_dropout (float):
            If non-zero, applies dropout to the alignment score matrix. Dropout
            probability must be between 0 and 1.
        value_transform_network (string or :class:`~deepmatcher.modules.Transform` or callable):
            For each word embedding in the input sequence, SelfAttention takes a
            weighted average of the aligning values, i.e., the aligning word
            embeddings based on the alignment scores. This parameter specifies the
            neural network to transform the values (word embeddings) before taking
            the weighted average. Argument must be None or specify a
            :ref:`transform-op` operation. If the argument is a string, the hidden
            size of the transform operation is computed as
            :code:`hidden_size // heads`. If argument is None, and `heads` is 1,
            then the values are not transformed. If argument is None and `heads`
            is > 1, then a 1 layer highway network without any non-linearity is
            used. The hidden size for this is computed as mentioned above.
        value_merge (string or :class:`~deepmatcher.modules.Merge` or callable):
            For each word embedding in the input sequence, each SelfAttention head
            produces one corresponding vector as output. This parameter specifies
            how to merge the outputs of all attention heads for each word
            embedding. Concatenates the outputs of all heads by default. Argument
            must specify a :ref:`merge-op` operation.
        transform_dropout (float):
            If non-zero, applies dropout to the output of the
            `value_transform_network`, if applicable. Dropout probability must be
            between 0 and 1.
        output_transform_network (string or :class:`~deepmatcher.modules.Transform` or callable):
            For each word embedding in the input sequence, SelfAttention produces
            one corresponding vector as output. This neural network specifies how
            to transform each of these output vectors to obtain a hidden
            representation of size `hidden_size`. Argument must be None or specify
            a :ref:`transform-op` operation. If argument is None, and `heads` is
            1, then the output vectors are not transformed. If argument is None
            and `heads` is > 1, then a 1 layer highway network without any
            non-linearity is used.
        output_dropout (float):
            If non-zero, applies dropout to the output of the
            `output_transform_network`, if applicable. Dropout probability must be
            between 0 and 1.
        bypass_network (string or :class:`Bypass` or callable):
            The bypass network (e.g. residual or highway network) to use. The
            input word embedding sequence to this module is considered as the raw
            input to the bypass network and the final output vector sequence
            (output of `value_merge` or `output_transform_network` if applicable)
            is considered as the transformed input. Argument must specify a
            :ref:`bypass-op` operation. If None, does not use a bypass network.
        input_size (int):
            The number of features in the input to the module. This parameter will
            be automatically specified by :class:`LazyModule`.
    """

    def _init(self,
              heads=1,
              hidden_size=None,
              input_dropout=0,
              alignment_network='decomposable',
              scale=False,
              score_dropout=0,
              value_transform_network=None,
              value_merge='concat',
              transform_dropout=0,
              output_transform_network=None,
              output_dropout=0,
              bypass_network='highway',
              input_size=None):
        hidden_size = hidden_size if hidden_size is not None else input_size

        # One independent alignment network per attention head.
        self.alignment_networks = nn.ModuleList()
        for head in range(heads):
            self.alignment_networks.append(
                dm.modules._alignment_module(alignment_network, hidden_size))

        # Multi-head default: a linear (no non-linearity) 1-layer highway
        # transform projecting values to hidden_size // heads per head.
        if value_transform_network is None and heads > 1:
            value_transform_network = dm.modules.Transform(
                '1-layer-highway', non_linearity=None,
                hidden_size=hidden_size // heads)
        self.value_transform_network = dm.modules._transform_module(
            value_transform_network, hidden_size // heads)

        self.value_merge = dm.modules._merge_module(value_merge)

        # Normalize alignment scores over the aligned-to (last) dimension.
        self.softmax = nn.Softmax(dim=2)

        if output_transform_network is None and heads > 1:
            output_transform_network = dm.modules.Transform(
                '1-layer-highway', non_linearity=None, hidden_size=hidden_size)
        self.output_transform_network = dm.modules._transform_module(
            output_transform_network, hidden_size)

        self.input_dropout = nn.Dropout(input_dropout)
        self.transform_dropout = nn.Dropout(transform_dropout)
        # BUG FIX: this was previously built from `output_dropout`, silently
        # ignoring the `score_dropout` parameter.
        self.score_dropout = nn.Dropout(score_dropout)
        self.output_dropout = nn.Dropout(output_dropout)

        self.bypass_network = dm.modules._bypass_module(bypass_network)

        self.heads = heads
        self.scale = scale
        self.hidden_size = hidden_size

    def _forward(self, input_with_meta):
        input = self.input_dropout(input_with_meta.data)

        values_aligned = []
        for head in range(self.heads):
            # Dims: batch x len1 x len2
            alignment_scores = self.score_dropout(
                self.alignment_networks[head](input, input))

            if self.scale:
                # Scaled dot-product attention. BUG FIX: `torch.sqrt` requires a
                # tensor argument but `hidden_size` is a plain int; use the
                # Python scalar square root instead.
                alignment_scores = alignment_scores / (self.hidden_size ** 0.5)

            if input_with_meta.lengths is not None:
                # Mask out padding positions so they get zero attention weight.
                mask = _utils.sequence_mask(input_with_meta.lengths)
                mask = mask.unsqueeze(1)  # Make it broadcastable.
                # `mask == 0` is equivalent to the old `1 - mask` for byte masks
                # and, unlike it, is also valid for bool masks in newer torch.
                alignment_scores.data.masked_fill_(mask == 0, -float('inf'))

            normalized_scores = self.softmax(alignment_scores)

            if self.value_transform_network is not None:
                values_transformed = self.transform_dropout(
                    self.value_transform_network(input))
            else:
                values_transformed = input

            # Dims: batch x len1 x channels
            values_aligned.append(
                torch.bmm(normalized_scores, values_transformed))

        # Merge per-head outputs (concatenation by default).
        values_merged = self.value_merge(*values_aligned)

        output = values_merged
        if self.output_transform_network:
            output = self.output_transform_network(output)
        output = self.output_dropout(output)

        # Residual / highway connection from the raw (dropped-out) input.
        final_output = self.bypass_network(output, input)

        return AttrTensor.from_old_metadata(final_output, input_with_meta)