# Copyright 2019 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
BERT encoders.
"""

import tensorflow as tf

from texar.tf.core.layers import get_initializer, get_layer
from texar.tf.modules.encoders.transformer_encoders import TransformerEncoder
from texar.tf.modules.embedders.embedders import WordEmbedder
from texar.tf.modules.embedders.position_embedders import PositionEmbedder
from texar.tf.modules.encoders.encoder_base import EncoderBase
from texar.tf.modules.pretrained.bert import PretrainedBERTMixin

__all__ = [
"BERTEncoder",
]


class BERTEncoder(EncoderBase, PretrainedBERTMixin):
    r"""Raw BERT Transformer for encoding sequences. Please see
    :class:`~texar.tf.modules.PretrainedBERTMixin` for a brief description
    of BERT.

    This module basically stacks
    :class:`~texar.tf.modules.WordEmbedder`,
    :class:`~texar.tf.modules.PositionEmbedder`,
    :class:`~texar.tf.modules.TransformerEncoder` and a dense pooler.

    Args:
        pretrained_model_name (optional): a `str`, the name of the
            pre-trained model (e.g., ``bert-base-uncased``). Please refer
            to :class:`~texar.tf.modules.PretrainedBERTMixin` for all
            supported models.
            If `None`, the model name in :attr:`hparams` is used.
cache_dir (optional): the path to a folder in which the
pre-trained models will be cached. If `None` (default),
a default directory (``texar_data`` folder under user's home
directory) will be used.
hparams (dict or HParams, optional): Hyperparameters. Missing
hyperparameter will be set to default values. See
:meth:`default_hparams` for the hyperparameter structure
and default values.
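
    Example:
        A minimal usage sketch (assumes the pre-trained checkpoint can be
        downloaded and cached):

        .. code-block:: python

            # Token ids of shape `[batch_size, max_time]`.
            inputs = tf.placeholder(tf.int64, shape=[None, None])

            encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
            outputs, pooled_output = encoder(inputs)
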
.. document private functions
.. automethod:: _build
"""

    def __init__(self,
pretrained_model_name=None,
cache_dir=None,
hparams=None):
super(BERTEncoder, self).__init__(hparams=hparams)
self.load_pretrained_config(pretrained_model_name, cache_dir)
with tf.variable_scope(self.variable_scope):
# Word embedding
self.word_embedder = WordEmbedder(
vocab_size=self._hparams.vocab_size,
hparams=self._hparams.embed)
# Segment embedding for each type of tokens
self.segment_embedder = WordEmbedder(
vocab_size=self._hparams.type_vocab_size,
hparams=self._hparams.segment_embed)
# Position embedding
self.position_embedder = PositionEmbedder(
position_size=self._hparams.position_size,
hparams=self._hparams.position_embed)
# The BERT encoder (a TransformerEncoder)
self.encoder = TransformerEncoder(hparams=self._hparams.encoder)
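
            # Dense pooler: a tanh dense layer that `_build` applies to the
            # hidden state of the first ([CLS]) token.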
with tf.variable_scope("pooler"):
kwargs_i = {"units": self._hparams.hidden_size,
"activation": tf.tanh}
layer_hparams = {"type": "Dense", "kwargs": kwargs_i}
self.pooler = get_layer(hparams=layer_hparams)

    def reset_parameters(self):
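        r"""Sets the default initializer of the module's variable scope
        to the one specified in ``hparams.initializer``, if any.
        """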
with tf.variable_scope(self.variable_scope):
if self._hparams.initializer:
tf.get_variable_scope().set_initializer(
get_initializer(self._hparams.initializer))

    @staticmethod
def default_hparams():
r"""Returns a dictionary of hyperparameters with default values.
* The encoder arch is determined by the constructor argument
:attr:`pretrained_model_name` if it's specified. In this case,
`hparams` are ignored.
* Otherwise, the encoder arch is determined by
`hparams['pretrained_model_name']` if it's specified. All other
configurations in `hparams` are ignored.
* If the above two are `None`, the encoder arch is defined by the
configurations in `hparams` and weights are randomly initialized.
.. code-block:: python
{
"pretrained_model_name": "bert-base-uncased",
"embed": {
"dim": 768,
"name": "word_embeddings"
},
"vocab_size": 30522,
"segment_embed": {
"dim": 768,
"name": "token_type_embeddings"
},
"type_vocab_size": 2,
"position_embed": {
"dim": 768,
"name": "position_embeddings"
},
"position_size": 512,
"encoder": {
"dim": 768,
"embedding_dropout": 0.1,
"multihead_attention": {
"dropout_rate": 0.1,
"name": "self",
"num_heads": 12,
"num_units": 768,
"output_dim": 768,
"use_bias": True
},
"name": "encoder",
"num_blocks": 12,
"poswise_feedforward": {
"layers": [
{ "kwargs": {
"activation": "gelu",
"name": "intermediate",
"units": 3072,
"use_bias": True
},
"type": "Dense"
},
{ "kwargs": {"activation": None,
"name": "output",
"units": 768,
"use_bias": True
},
"type": "Dense"
}
]
},
"residual_dropout": 0.1,
"use_bert_config": True
},
"hidden_size": 768,
"initializer": None,
"name": "bert_encoder"
}

        Here:

        The default parameters are values for the uncased BERT-Base model.

        `"pretrained_model_name"`: str or None
            The name of the pre-trained BERT model. If `None`, the model
            will be randomly initialized.

        `"embed"`: dict
            Hyperparameters for the word embedding layer.

        `"vocab_size"`: int
            The vocabulary size of `inputs` in the BERT model.

        `"segment_embed"`: dict
            Hyperparameters for the segment embedding layer.

        `"type_vocab_size"`: int
            The vocabulary size of the `segment_ids` passed into `BertModel`.

        `"position_embed"`: dict
            Hyperparameters for the position embedding layer.

        `"position_size"`: int
            The maximum sequence length that this model might ever be used
            with.

        `"encoder"`: dict
            Hyperparameters for the TransformerEncoder.
            See :func:`~texar.tf.modules.TransformerEncoder.default_hparams`
            for details.

        `"hidden_size"`: int
            Size of the pooler dense layer.

        `"initializer"`: dict, optional
            Hyperparameters of the default initializer that initializes
            variables created in this module.
            See :func:`~texar.tf.core.get_initializer` for details.

        `"name"`: str
            Name of the module.
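
        Example:
            A sketch of overriding a few defaults; fields left unspecified
            fall back to the values above:

            .. code-block:: python

                hparams = {
                    "pretrained_model_name": None,  # random initialization
                    "encoder": {"num_blocks": 6},   # a shallower encoder
                }
                encoder = BERTEncoder(hparams=hparams)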
"""
return {
'pretrained_model_name': 'bert-base-uncased',
'embed': {
'dim': 768,
'name': 'word_embeddings'
},
'vocab_size': 30522,
'segment_embed': {
'dim': 768,
'name': 'token_type_embeddings'
},
'type_vocab_size': 2,
'position_embed': {
'dim': 768,
'name': 'position_embeddings'
},
'position_size': 512,
'encoder': {
'dim': 768,
'embedding_dropout': 0.1,
'multihead_attention': {
'dropout_rate': 0.1,
'name': 'self',
'num_heads': 12,
'num_units': 768,
'output_dim': 768,
'use_bias': True
},
'name': 'encoder',
'num_blocks': 12,
'poswise_feedforward': {
'layers': [
{
'kwargs': {
'activation': 'gelu',
'name': 'intermediate',
'units': 3072,
'use_bias': True
},
'type': 'Dense'
},
{
'kwargs': {
'activation': None,
'name': 'output',
'units': 768,
'use_bias': True
},
'type': 'Dense'
}
]
},
'residual_dropout': 0.1,
'use_bert_config': True
},
'hidden_size': 768,
'initializer': None,
'name': 'bert_encoder',
'@no_typecheck': ['pretrained_model_name']
}

    def _build(self,
inputs,
sequence_length=None,
segment_ids=None,
mode=None,
**kwargs):
"""Encodes the inputs.
Args:
inputs: A 2D Tensor of shape `[batch_size, max_time]`,
containing the token ids of tokens in the input sequences.
segment_ids (optional): A 2D Tensor of shape
`[batch_size, max_time]`, containing the segment ids
of tokens in input sequences. If `None` (default), a
tensor with all elements set to zero is used.
sequence_length (optional): A 1D Tensor of shape `[batch_size]`.
Input tokens beyond respective sequence lengths are masked
out automatically.
mode (optional): A tensor taking value in
:tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`,
including `TRAIN`, `EVAL`, and `PREDICT`. Used to toggle
dropout.
If `None` (default), :func:`texar.tf.global_mode` is used.
**kwargs: Keyword arguments.

        Returns:
            A pair :attr:`(outputs, pooled_output)`:

            - :attr:`outputs`: A Tensor of shape \
              `[batch_size, max_time, dim]` containing the \
              encoded vectors.
            - :attr:`pooled_output`: A Tensor of shape \
              `[batch_size, hidden_size]`, the output of a dense \
              pooler applied to the hidden state associated with the \
              first token of the input (`[CLS]`); see BERT's paper.
        """
if segment_ids is None:
segment_ids = tf.zeros_like(inputs)
word_embeds = self.word_embedder(inputs)
segment_embeds = self.segment_embedder(segment_ids)
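
        # Position embeddings cover all `max_time` steps of every example;
        # padding steps are masked out by the encoder via `sequence_length`.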
batch_size = tf.shape(inputs)[0]
pos_length = tf.ones([batch_size], tf.int32) * tf.shape(inputs)[1]
pos_embeds = self.position_embedder(sequence_length=pos_length)
input_embeds = word_embeds + segment_embeds + pos_embeds
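        # Without explicit lengths, treat every sequence as full length.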
if sequence_length is None:
sequence_length = tf.ones([batch_size], tf.int32) \
* tf.shape(inputs)[1]
output = self.encoder(input_embeds, sequence_length, mode)
with tf.variable_scope("pooler"):
# taking the hidden state corresponding to the first token.
first_token_tensor = tf.squeeze(output[:, 0:1, :], axis=1)
pooled_output = self.pooler(first_token_tensor)
if not self._built:
self._add_internal_trainable_variables()
self._built = True
self.init_pretrained_weights(self.variable_scope.name)
return output, pooled_output