Source code for texar.tf.modules.encoders.bert_encoder

# Copyright 2019 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
BERT encoders.
"""

import tensorflow as tf

from texar.tf.core.layers import get_initializer, get_layer
from texar.tf.modules.encoders.transformer_encoders import TransformerEncoder
from texar.tf.modules.embedders.embedders import WordEmbedder
from texar.tf.modules.embedders.position_embedders import PositionEmbedder
from texar.tf.modules.encoders.encoder_base import EncoderBase
from texar.tf.modules.pretrained.bert import PretrainedBERTMixin

__all__ = [
    "BERTEncoder",
]


class BERTEncoder(EncoderBase, PretrainedBERTMixin):
    r"""Raw BERT Transformer for encoding sequences. Please see
    :class:`~texar.tf.modules.PretrainedBERTMixin` for a brief description
    of BERT.

    This module stacks
    :class:`~texar.tf.modules.WordEmbedder`,
    :class:`~texar.tf.modules.PositionEmbedder`,
    :class:`~texar.tf.modules.TransformerEncoder` and a dense pooler.

    Args:
        pretrained_model_name (optional): a `str`, the name of the
            pre-trained model (e.g., ``bert-base-uncased``). Please refer to
            :class:`~texar.tf.modules.PretrainedBERTMixin` for
            all supported models.
            If `None`, the model name in :attr:`hparams` is used.
        cache_dir (optional): the path to a folder in which the
            pre-trained models will be cached. If `None` (default),
            a default directory (the ``texar_data`` folder under the user's
            home directory) will be used.
        hparams (dict or HParams, optional): Hyperparameters. Missing
            hyperparameters will be set to default values. See
            :meth:`default_hparams` for the hyperparameter structure
            and default values.

    .. document private functions
    .. automethod:: _build
    """

    def __init__(self,
                 pretrained_model_name=None,
                 cache_dir=None,
                 hparams=None):
        super(BERTEncoder, self).__init__(hparams=hparams)

        self.load_pretrained_config(pretrained_model_name, cache_dir)

        with tf.variable_scope(self.variable_scope):

            # Word embedding
            self.word_embedder = WordEmbedder(
                vocab_size=self._hparams.vocab_size,
                hparams=self._hparams.embed)

            # Segment embedding for each type of tokens
            self.segment_embedder = WordEmbedder(
                vocab_size=self._hparams.type_vocab_size,
                hparams=self._hparams.segment_embed)

            # Position embedding
            self.position_embedder = PositionEmbedder(
                position_size=self._hparams.position_size,
                hparams=self._hparams.position_embed)

            # The BERT encoder (a TransformerEncoder)
            self.encoder = TransformerEncoder(hparams=self._hparams.encoder)

            with tf.variable_scope("pooler"):
                kwargs_i = {"units": self._hparams.hidden_size,
                            "activation": tf.tanh}
                layer_hparams = {"type": "Dense", "kwargs": kwargs_i}
                self.pooler = get_layer(hparams=layer_hparams)

    def reset_parameters(self):
        with tf.variable_scope(self.variable_scope):
            if self._hparams.initializer:
                tf.get_variable_scope().set_initializer(
                    get_initializer(self._hparams.initializer))

    @staticmethod
    def default_hparams():
        r"""Returns a dictionary of hyperparameters with default values.

        * The encoder architecture is determined by the constructor argument
          :attr:`pretrained_model_name` if it's specified. In this case,
          `hparams` are ignored.
        * Otherwise, the encoder architecture is determined by
          `hparams['pretrained_model_name']` if it's specified. All other
          configurations in `hparams` are ignored.
        * If the above two are `None`, the encoder architecture is defined by
          the configurations in `hparams` and weights are randomly
          initialized.

        .. code-block:: python

            {
                "pretrained_model_name": "bert-base-uncased",
                "embed": {
                    "dim": 768,
                    "name": "word_embeddings"
                },
                "vocab_size": 30522,
                "segment_embed": {
                    "dim": 768,
                    "name": "token_type_embeddings"
                },
                "type_vocab_size": 2,
                "position_embed": {
                    "dim": 768,
                    "name": "position_embeddings"
                },
                "position_size": 512,

                "encoder": {
                    "dim": 768,
                    "embedding_dropout": 0.1,
                    "multihead_attention": {
                        "dropout_rate": 0.1,
                        "name": "self",
                        "num_heads": 12,
                        "num_units": 768,
                        "output_dim": 768,
                        "use_bias": True
                    },
                    "name": "encoder",
                    "num_blocks": 12,
                    "poswise_feedforward": {
                        "layers": [
                            {
                                "kwargs": {
                                    "activation": "gelu",
                                    "name": "intermediate",
                                    "units": 3072,
                                    "use_bias": True
                                },
                                "type": "Dense"
                            },
                            {
                                "kwargs": {
                                    "activation": None,
                                    "name": "output",
                                    "units": 768,
                                    "use_bias": True
                                },
                                "type": "Dense"
                            }
                        ]
                    },
                    "residual_dropout": 0.1,
                    "use_bert_config": True
                },
                "hidden_size": 768,
                "initializer": None,
                "name": "bert_encoder"
            }

        Here:

        The default parameters are values for the uncased BERT-Base model.

        `"pretrained_model_name"`: str or None
            The name of the pre-trained BERT model. If `None`, the model
            will be randomly initialized.

        `"embed"`: dict
            Hyperparameters for the word embedding layer.

        `"vocab_size"`: int
            The vocabulary size of `inputs` in the BERT model.

        `"segment_embed"`: dict
            Hyperparameters for the segment embedding layer.

        `"type_vocab_size"`: int
            The vocabulary size of the `segment_ids` passed into `BertModel`.

        `"position_embed"`: dict
            Hyperparameters for the position embedding layer.

        `"position_size"`: int
            The maximum sequence length that this model might ever be used
            with.

        `"encoder"`: dict
            Hyperparameters for the TransformerEncoder. See
            :func:`~texar.tf.modules.TransformerEncoder.default_hparams`
            for details.

        `"hidden_size"`: int
            Size of the pooler dense layer.

        `"initializer"`: dict, optional
            Hyperparameters of the default initializer that initializes
            variables created in this module.
            See :func:`~texar.tf.core.get_initializer` for details.

        `"name"`: str
            Name of the module.
        """
        return {
            'pretrained_model_name': 'bert-base-uncased',
            'embed': {
                'dim': 768,
                'name': 'word_embeddings'
            },
            'vocab_size': 30522,
            'segment_embed': {
                'dim': 768,
                'name': 'token_type_embeddings'
            },
            'type_vocab_size': 2,
            'position_embed': {
                'dim': 768,
                'name': 'position_embeddings'
            },
            'position_size': 512,

            'encoder': {
                'dim': 768,
                'embedding_dropout': 0.1,
                'multihead_attention': {
                    'dropout_rate': 0.1,
                    'name': 'self',
                    'num_heads': 12,
                    'num_units': 768,
                    'output_dim': 768,
                    'use_bias': True
                },
                'name': 'encoder',
                'num_blocks': 12,
                'poswise_feedforward': {
                    'layers': [
                        {
                            'kwargs': {
                                'activation': 'gelu',
                                'name': 'intermediate',
                                'units': 3072,
                                'use_bias': True
                            },
                            'type': 'Dense'
                        },
                        {
                            'kwargs': {
                                'activation': None,
                                'name': 'output',
                                'units': 768,
                                'use_bias': True
                            },
                            'type': 'Dense'
                        }
                    ]
                },
                'residual_dropout': 0.1,
                'use_bert_config': True
            },
            'hidden_size': 768,
            'initializer': None,
            'name': 'bert_encoder',
            '@no_typecheck': ['pretrained_model_name']
        }

    def _build(self,
               inputs,
               sequence_length=None,
               segment_ids=None,
               mode=None,
               **kwargs):
        """Encodes the inputs.

        Args:
            inputs: A 2D Tensor of shape `[batch_size, max_time]`,
                containing the token ids of tokens in the input sequences.
            segment_ids (optional): A 2D Tensor of shape
                `[batch_size, max_time]`, containing the segment ids
                of tokens in the input sequences. If `None` (default), a
                tensor with all elements set to zero is used.
            sequence_length (optional): A 1D Tensor of shape `[batch_size]`.
                Input tokens beyond respective sequence lengths are masked
                out automatically.
            mode (optional): A tensor taking value in
                :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`,
                including `TRAIN`, `EVAL`, and `PREDICT`. Used to toggle
                dropout. If `None` (default), :func:`texar.tf.global_mode`
                is used.
            **kwargs: Keyword arguments.

        Returns:
            A pair :attr:`(outputs, pooled_output)`

            - :attr:`outputs`: A Tensor of shape \
              `[batch_size, max_time, dim]` containing the \
              encoded vectors.

            - :attr:`pooled_output`: A Tensor of shape \
              `[batch_size, hidden_size]`, the output of the dense \
              pooler applied to the hidden state of the first token \
              of the input (`CLS`); see the BERT paper.
        """
        if segment_ids is None:
            segment_ids = tf.zeros_like(inputs)

        word_embeds = self.word_embedder(inputs)
        segment_embeds = self.segment_embedder(segment_ids)

        batch_size = tf.shape(inputs)[0]
        pos_length = tf.ones([batch_size], tf.int32) * tf.shape(inputs)[1]
        pos_embeds = self.position_embedder(sequence_length=pos_length)

        input_embeds = word_embeds + segment_embeds + pos_embeds

        if sequence_length is None:
            sequence_length = tf.ones([batch_size], tf.int32) \
                              * tf.shape(inputs)[1]

        output = self.encoder(input_embeds, sequence_length, mode)

        with tf.variable_scope("pooler"):
            # Take the hidden state corresponding to the first token.
            first_token_tensor = tf.squeeze(output[:, 0:1, :], axis=1)
            pooled_output = self.pooler(first_token_tensor)

        if not self._built:
            self._add_internal_trainable_variables()
            self._built = True
            self.init_pretrained_weights(self.variable_scope.name)

        return output, pooled_output
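
A minimal usage sketch (editorial illustration, not part of the module source): instantiate the encoder from the pre-trained ``bert-base-uncased`` weights and encode a batch of token ids. The placeholder tensors below stand in for ids produced by a BERT tokenizer, and evaluating the outputs requires the usual TF1 session setup.

    import tensorflow as tf
    import texar.tf as tx

    # Illustrative inputs; a real pipeline would feed ids from a BERT tokenizer.
    input_ids = tf.placeholder(tf.int32, shape=[None, None])   # [batch_size, max_time]
    seq_lengths = tf.placeholder(tf.int32, shape=[None])       # [batch_size]

    encoder = tx.modules.BERTEncoder(pretrained_model_name="bert-base-uncased")

    # Calling the module invokes `_build`.
    outputs, pooled_output = encoder(input_ids, sequence_length=seq_lengths)
    # outputs:       [batch_size, max_time, 768]
    # pooled_output: [batch_size, 768]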
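
And a second sketch, continuing the one above, assuming a smaller, randomly initialized BERT-style encoder is wanted: start from `default_hparams()`, set `pretrained_model_name` to `None` so no pre-trained weights are loaded, and override the architecture entries (the reduced `num_blocks` value is arbitrary, purely for illustration).

    hparams = tx.modules.BERTEncoder.default_hparams()
    hparams["pretrained_model_name"] = None   # architecture from hparams; weights random
    hparams["encoder"]["num_blocks"] = 3      # arbitrary smaller depth, illustration only

    small_encoder = tx.modules.BERTEncoder(hparams=hparams)
    outputs, pooled_output = small_encoder(input_ids, sequence_length=seq_lengths)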