# Source code for the BERT utilities module.

# Copyright 2019 The Texar Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utils of BERT Modules.
"""

from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import collections
import json
import os
import re

from abc import ABCMeta

import tensorflow as tf

from import PretrainedMixin

# Public names exported by this module.
__all__ = [
    "PretrainedBERTMixin",
]

# Base URL that the per-model download paths in
# ``PretrainedBERTMixin._MODEL2URL`` are appended to.
# NOTE(review): this definition is not visible in the reviewed text; the
# value is the presumed public storage bucket of the official BERT
# releases -- confirm before relying on it.
_BERT_PATH = "https://storage.googleapis.com/bert_models/"


class PretrainedBERTMixin(PretrainedMixin):
    r"""A mixin class to support loading pre-trained checkpoints for modules
    that implement the BERT model.

    The BERT model was proposed in (`Devlin et al`. 2018)
    `BERT: Pre-training of Deep Bidirectional Transformers for Language
    Understanding`_. It is a bidirectional Transformer language model
    pre-trained on large text corpora. Available model names include:

      * ``bert-base-uncased``: 12-layer, 768-hidden, 12-heads,
        110M parameters.
      * ``bert-large-uncased``: 24-layer, 1024-hidden, 16-heads,
        340M parameters.
      * ``bert-base-cased``: 12-layer, 768-hidden, 12-heads,
        110M parameters.
      * ``bert-large-cased``: 24-layer, 1024-hidden, 16-heads,
        340M parameters.
      * ``bert-base-multilingual-uncased``: 102 languages, 12-layer,
        768-hidden, 12-heads, 110M parameters.
      * ``bert-base-multilingual-cased``: 104 languages, 12-layer,
        768-hidden, 12-heads, 110M parameters.
      * ``bert-base-chinese``: Chinese Simplified and Traditional, 12-layer,
        768-hidden, 12-heads, 110M parameters.

    We provide the following BERT classes:

      * :class:`~texar.tf.modules.BERTEncoder` for text encoding.
      * :class:`~texar.tf.modules.BERTClassifier` for text classification
        and sequence tagging.

    .. _`BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`:
        https://arxiv.org/abs/1810.04805
    """

    __metaclass__ = ABCMeta

    _MODEL_NAME = "BERT"

    # NOTE(review): every URL below ends at the release-date directory; the
    # archive file name that should follow appears to be missing -- confirm
    # against the published list of BERT checkpoints.
    _MODEL2URL = {
        'bert-base-uncased': _BERT_PATH + "2018_10_18/",
        'bert-large-uncased': _BERT_PATH + "2018_10_18/",
        'bert-base-cased': _BERT_PATH + "2018_10_18/",
        'bert-large-cased': _BERT_PATH + "2018_10_18/",
        'bert-base-multilingual-uncased': _BERT_PATH + "2018_11_23/",
        'bert-base-multilingual-cased': _BERT_PATH + "2018_11_03/",
        'bert-base-chinese': _BERT_PATH + "2018_11_03/",
    }

    @classmethod
    def _transform_config(cls, pretrained_model_name, cache_dir):
        r"""Translate the checkpoint's JSON config into a hyperparameter dict.

        Only the top level of :attr:`cache_dir` is searched for a file whose
        name ends with ``config.json``. If several files match, the last one
        listed is used.

        Args:
            pretrained_model_name: Name of the pre-trained model (not used by
                this implementation).
            cache_dir: Directory holding the downloaded checkpoint files.

        Returns:
            A dict with the keys ``hidden_size``, ``embed``, ``vocab_size``,
            ``segment_embed``, ``type_vocab_size``, ``position_embed``,
            ``position_size`` and ``encoder``, filled from the checkpoint
            config.

        Raises:
            ValueError: If no config file is found in :attr:`cache_dir`.
        """
        # `os.walk` yields nothing for a missing directory; fall back to an
        # empty listing so that case reaches the ValueError below too.
        root, _, files = next(os.walk(cache_dir), (cache_dir, [], []))

        config_path = None
        for filename in files:
            if filename.endswith('config.json'):
                config_path = os.path.join(root, filename)

        # Validate before opening: `open(None)` would fail with an unrelated
        # TypeError and hide the real problem.
        if config_path is None:
            raise ValueError("Cannot find the config file in {}".format(
                cache_dir))

        with open(config_path) as f:
            config_ckpt = json.load(f)

        configs = {}
        hidden_dim = config_ckpt['hidden_size']
        configs['hidden_size'] = config_ckpt['hidden_size']
        configs['embed'] = {
            'name': 'word_embeddings',
            'dim': hidden_dim}
        configs['vocab_size'] = config_ckpt['vocab_size']

        configs['segment_embed'] = {
            'name': 'token_type_embeddings',
            'dim': hidden_dim}
        configs['type_vocab_size'] = config_ckpt['type_vocab_size']

        configs['position_embed'] = {
            'name': 'position_embeddings',
            'dim': hidden_dim}
        configs['position_size'] = config_ckpt['max_position_embeddings']

        configs['encoder'] = {
            'name': 'encoder',
            'embedding_dropout': config_ckpt['hidden_dropout_prob'],
            'num_blocks': config_ckpt['num_hidden_layers'],
            'multihead_attention': {
                'use_bias': True,
                'num_units': hidden_dim,
                'num_heads': config_ckpt['num_attention_heads'],
                'output_dim': hidden_dim,
                'dropout_rate': config_ckpt['attention_probs_dropout_prob'],
                'name': 'self'
            },
            'residual_dropout': config_ckpt['hidden_dropout_prob'],
            'dim': hidden_dim,
            'use_bert_config': True,
            'poswise_feedforward': {
                "layers": [
                    {
                        'type': 'Dense',
                        'kwargs': {
                            'name': 'intermediate',
                            'units': config_ckpt['intermediate_size'],
                            'activation': config_ckpt['hidden_act'],
                            'use_bias': True,
                        }
                    },
                    {
                        'type': 'Dense',
                        'kwargs': {
                            'name': 'output',
                            'units': hidden_dim,
                            'activation': None,
                            'use_bias': True,
                        }
                    },
                ],
            },
        }
        return configs

    def _init_from_checkpoint(self, pretrained_model_name,
                              cache_dir, scope_name, **kwargs):
        r"""Initialize the trainable variables from ``bert_model.ckpt``.

        Args:
            pretrained_model_name: Name of the pre-trained model (not used by
                this implementation).
            cache_dir: Directory that contains ``bert_model.ckpt``.
            scope_name: Variable scope of the module; it replaces the
                ``bert`` prefix of the checkpoint variable names.
            **kwargs: Accepted for interface compatibility and ignored.
        """
        tvars = tf.trainable_variables()
        init_checkpoint = os.path.abspath(os.path.join(cache_dir,
                                                       'bert_model.ckpt'))
        assignment_map, _ = self._get_assignment_map_from_checkpoint(
            tvars, init_checkpoint, scope_name)
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    @staticmethod
    def _checkpoint_to_model_name(check_name_scope):
        r"""Rewrite one re-scoped checkpoint variable name to the module's
        naming scheme; the first rule that changes the name wins."""
        # (pattern, old fragment, new fragment). The fragment is replaced
        # only inside the matched text, so the `layer_N` prefix is kept.
        layer_rules = (
            ('layer_\\d+/output/dense', 'output/dense', 'ffn/output'),
            ('layer_\\d+/output/LayerNorm', 'output/LayerNorm',
             'ffn/LayerNorm'),
            ('layer_\\d+/intermediate/dense', 'intermediate/dense',
             'ffn/intermediate'),
        )
        for pattern, old, new in layer_rules:
            model_name = re.sub(
                pattern,
                lambda x, old=old, new=new: x.group(0).replace(old, new),
                check_name_scope)
            if model_name != check_name_scope:
                return model_name

        model_name = re.sub('attention/output/dense',
                            'attention/self/output',
                            check_name_scope)
        if model_name != check_name_scope:
            return model_name

        return check_name_scope.replace(
            'attention/output/LayerNorm', 'output/LayerNorm')

    def _get_assignment_map_from_checkpoint(self, tvars, init_checkpoint,
                                            scope_name):
        r"""Compute the union of the current variables and checkpoint
        variables.

        Because the variable scopes of the original BERT and the Texar
        implementation differ, we need to build an assignment map to match
        the variables.

        Args:
            tvars: The variables of the current model.
            init_checkpoint: Path of the checkpoint to read variable names
                from.
            scope_name: Variable scope of the module; it replaces the
                ``bert`` prefix of the checkpoint variable names.

        Returns:
            A tuple ``(assignment_map, initialized_variable_names)``.
            ``assignment_map`` maps checkpoint variable names to model
            variable names. ``initialized_variable_names`` is a dict whose
            keys are the mapped model variable names, each with and without
            the ``:0`` suffix.
        """
        initialized_variable_names = {}

        name_to_variable = collections.OrderedDict()
        for var in tvars:
            name = var.name
            # Drop the trailing output index, e.g. ``scope/w:0`` -> ``scope/w``.
            m = re.match("^(.*):\\d+$", name)
            if m is not None:
                name = m.group(1)
            name_to_variable[name] = var

        init_vars = tf.train.list_variables(init_checkpoint)

        # The embedding variables are mapped explicitly.
        assignment_map = {
            'bert/embeddings/word_embeddings':
                scope_name + '/word_embeddings/w',
            'bert/embeddings/token_type_embeddings':
                scope_name + '/token_type_embeddings/w',
            'bert/embeddings/position_embeddings':
                scope_name + '/position_embeddings/w',
            'bert/embeddings/LayerNorm/beta':
                scope_name + '/encoder/LayerNorm/beta',
            'bert/embeddings/LayerNorm/gamma':
                scope_name + '/encoder/LayerNorm/gamma',
        }
        for model_name in assignment_map.values():
            initialized_variable_names[model_name] = 1
            initialized_variable_names[model_name + ":0"] = 1

        for check_name, _ in init_vars:
            if not check_name.startswith('bert'):
                continue
            if check_name.startswith('bert/embeddings'):
                # Already covered by the explicit entries above.
                continue

            check_name_scope = check_name.replace("bert/", scope_name + '/')
            model_name = self._checkpoint_to_model_name(check_name_scope)

            if model_name in name_to_variable:
                assignment_map[check_name] = model_name
                initialized_variable_names[model_name] = 1
                initialized_variable_names[model_name + ":0"] = 1
            else:
                tf.logging.info(
                    'model name:{} not exist'.format(model_name))

        return assignment_map, initialized_variable_names