Source code for texar.tf.core.optimization

# Copyright 2018 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Various optimization related utilities.
"""

import re
import tensorflow as tf

from texar.tf.hyperparams import HParams
from texar.tf.utils import utils

# pylint: disable=too-many-arguments, no-member

__all__ = [
    "default_optimization_hparams",
    "get_optimizer_fn",
    "get_learning_rate_decay_fn",
    "get_gradient_clip_fn",
    "get_optimizer",
    "get_train_op",
    "AdamWeightDecayOptimizer",
]


def default_optimization_hparams():
    """Returns a `dict` of default hyperparameters of the training op
    and their default values

    .. role:: python(code)
        :language: python

    .. code-block:: python

        {
            "optimizer": {
                "type": "AdamOptimizer",
                "kwargs": {
                    "learning_rate": 0.001
                }
            },
            "learning_rate_decay": {
                "type": "",
                "kwargs": {},
                "min_learning_rate": 0.,
                "start_decay_step": 0,
                "end_decay_step": inf
            },
            "gradient_clip": {
                "type": "",
                "kwargs": {}
            },
            "gradient_noise_scale": None,
            "name": None
        }

    Here:

    "optimizer": dict
        Hyperparameters of a :tf_main:`tf.train.Optimizer <train/Optimizer>`.

        - **"type"** specifies the optimizer class. This can be

            - The string name or full module path of an optimizer class. \
            If the class name is provided, the class must be in module \
            :tf_main:`tf.train <train>`, \
            :tf_main:`tf.contrib.opt <contrib/opt>`, \
            :mod:`texar.tf.custom`, or :mod:`texar.tf.core.optimization`.
            - An optimizer class.
            - An instance of an optimizer class.

            For example

            .. code-block:: python

                "type": "AdamOptimizer"                    # class name
                "type": "my_module.MyOptimizer"            # module path
                "type": tf.contrib.opt.AdamWOptimizer      # class
                "type": my_module.MyOptimizer              # class
                "type": GradientDescentOptimizer(learning_rate=0.1) # instance
                "type": MyOptimizer(...)                   # instance

        - **"kwargs"** is a `dict` specifying keyword arguments for creating \
        the optimizer class instance, with :python:`opt_class(**kwargs)`. \
        Ignored if "type" is a class instance.

    "learning_rate_decay": dict
        Hyperparameters of the learning rate decay function. The learning
        rate starts to decay from :attr:`"start_decay_step"` and remains
        unchanged after :attr:`"end_decay_step"` or once it reaches
        :attr:`"min_learning_rate"`.

        The decay function is specified in "type" and "kwargs".

        - "type" can be a decay function or its name or module path. If a \
        function name is provided, it must be from module \
        :tf_main:`tf.train <train>`, :mod:`texar.tf.custom`, or \
        :mod:`texar.tf.core.optimization`.

        - "kwargs" is a `dict` of keyword arguments for the function \
        excluding arguments named "global_step" and "learning_rate".

        The function is called with
        :python:`lr = decay_fn(learning_rate=lr, global_step=offset_step,
        **kwargs)`, where `offset_step` is the global step offset as above.
        The only exception is :tf_main:`tf.train.piecewise_constant
        <train/piecewise_constant>`, which is called with
        :python:`lr = piecewise_constant(x=offset_step, **kwargs)`.

    "gradient_clip": dict
        Hyperparameters of gradient clipping. The gradient clipping function
        takes a list of `(gradients, variables)` tuples and returns a list
        of `(clipped_gradients, variables)` tuples. Typical examples include
        :tf_main:`tf.clip_by_global_norm <clip_by_global_norm>`,
        :tf_main:`tf.clip_by_value <clip_by_value>`,
        :tf_main:`tf.clip_by_norm <clip_by_norm>`,
        :tf_main:`tf.clip_by_average_norm <clip_by_average_norm>`, etc.

        "type" specifies the gradient clip function, and can be a function,
        or its name or module path. If a function name is provided, the
        function must be from module :tf_main:`tf < >`,
        :mod:`texar.tf.custom`, or :mod:`texar.tf.core.optimization`.

        "kwargs" specifies keyword arguments to the function, except
        arguments named "t" or "t_list".

        The function is called with
        :python:`clipped_grads, _ = clip_fn(t_list=grads, **kwargs)`
        (e.g., for :tf_main:`tf.clip_by_global_norm <clip_by_global_norm>`)
        or
        :python:`clipped_grads = [clip_fn(t=grad, **kwargs) for grad in grads]`
        (e.g., for :tf_main:`tf.clip_by_value <clip_by_value>`).

    "gradient_noise_scale": float, optional
        Adds 0-mean normal noise scaled by this value to the gradient.
    """
    return {
        "optimizer": {
            "type": "AdamOptimizer",
            "kwargs": {
                "learning_rate": 0.001
            }
        },
        "learning_rate_decay": {
            "type": "",
            "kwargs": {},
            "min_learning_rate": 0.,
            "start_decay_step": 0,
            "end_decay_step": utils.MAX_SEQ_LENGTH,
        },
        "gradient_clip": {
            "type": "",
            "kwargs": {}
        },
        "gradient_noise_scale": None,
        # TODO(zhiting): allow module-level control of gradient_multipliers
        "name": None
    }
def get_optimizer_fn(hparams=None):
    """Returns a function `optimizer_fn` of making optimizer instance, along
    with the optimizer class.

    .. role:: python(code)
        :language: python

    The function has the signature
    :python:`optimizer_fn(learning_rate=None) -> optimizer class instance`

    See the :attr:`"optimizer"` field of
    :meth:`~texar.tf.core.default_optimization_hparams` for all
    hyperparameters and default values.

    The optimizer class must be a subclass of
    :tf_main:`tf.train.Optimizer <train/Optimizer>`.

    Args:
        hparams (dict or HParams, optional): hyperparameters. Missing
            hyperparameters are set to default values automatically.

    Returns:
        - If hparams["type"] is a string or optimizer class, returns \
        `(optimizer_fn, optimizer class)`,

        - If hparams["type"] is an optimizer instance, returns \
        `(the optimizer instance, optimizer class)`
    """
    if hparams is None or isinstance(hparams, dict):
        hparams = HParams(
            hparams, default_optimization_hparams()["optimizer"])

    opt = hparams["type"]
    if isinstance(opt, tf.train.Optimizer):
        return opt, type(opt)

    opt_modules = ['tensorflow.train',
                   'tensorflow.contrib.opt',
                   'texar.tf.core.optimization',
                   'texar.tf.custom']
    try:
        opt_class = utils.check_or_get_class(opt, opt_modules,
                                             tf.train.Optimizer)
    except TypeError:
        raise ValueError(
            "Unrecognized optimizer. Must be string name of the "
            "optimizer class, or the class which is a subclass of "
            "tf.train.Optimizer, or an instance of the subclass of "
            "Optimizer.")

    def _get_opt(learning_rate=None):
        opt_kwargs = hparams["kwargs"].todict()
        fn_args = set(utils.get_args(opt_class.__init__))
        if 'learning_rate' in fn_args and learning_rate is not None:
            opt_kwargs["learning_rate"] = learning_rate
        return opt_class(**opt_kwargs)

    return _get_opt, opt_class
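
# Usage sketch (illustrative, not part of the original module): build the
# factory from an hparams dict and instantiate it, optionally overriding the
# learning rate at call time. Values are arbitrary examples.
#
#     opt_fn, opt_class = get_optimizer_fn(
#         {"type": "GradientDescentOptimizer",
#          "kwargs": {"learning_rate": 0.1}})
#     optimizer = opt_fn(learning_rate=0.05)  # opt_class(learning_rate=0.05)
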
def get_learning_rate_decay_fn(hparams=None):
    """Creates a learning rate decay function based on the hyperparameters.

    See the :attr:`learning_rate_decay` field in
    :meth:`~texar.tf.core.default_optimization_hparams` for all
    hyperparameters and default values.

    Args:
        hparams (dict or HParams, optional): hyperparameters. Missing
            hyperparameters are set to default values automatically.

    Returns:
        function or None: If hparams["type"] is specified, returns a
        function that takes `(learning_rate, global_step)` and returns a
        decayed learning rate. If hparams["type"] is empty, returns `None`.
    """
    if hparams is None or isinstance(hparams, dict):
        hparams = HParams(
            hparams, default_optimization_hparams()["learning_rate_decay"])

    fn_type = hparams["type"]
    if fn_type is None or fn_type == "":
        return None

    fn_modules = ["tensorflow.train", "texar.tf.custom"]
    decay_fn = utils.get_function(fn_type, fn_modules)
    fn_kwargs = hparams["kwargs"]
    if isinstance(fn_kwargs, HParams):
        fn_kwargs = fn_kwargs.todict()

    start_step = tf.cast(hparams["start_decay_step"], tf.int32)
    end_step = tf.cast(hparams["end_decay_step"], tf.int32)

    def lr_decay_fn(learning_rate, global_step):
        """Learning rate decay function.

        Args:
            learning_rate (float or Tensor): The original learning rate.
            global_step (int or scalar int Tensor): optimization step
                counter.

        Returns:
            scalar float Tensor: decayed learning rate.
        """
        offset_global_step = tf.maximum(
            tf.minimum(tf.cast(global_step, tf.int32), end_step) - start_step,
            0)
        if decay_fn == tf.train.piecewise_constant:
            decayed_lr = decay_fn(x=offset_global_step, **fn_kwargs)
        else:
            fn_kwargs_ = {
                "learning_rate": learning_rate,
                "global_step": offset_global_step}
            fn_kwargs_.update(fn_kwargs)
            decayed_lr = utils.call_function_with_redundant_kwargs(
                decay_fn, fn_kwargs_)

        decayed_lr = tf.maximum(decayed_lr, hparams["min_learning_rate"])

        return decayed_lr

    return lr_decay_fn
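
# Usage sketch (illustrative, not part of the original module): configure
# `tf.train.exponential_decay`; the kwargs are its keyword arguments minus
# `learning_rate` and `global_step`, which the returned function supplies
# itself. All values are arbitrary examples.
#
#     lr_decay_fn = get_learning_rate_decay_fn({
#         "type": "exponential_decay",
#         "kwargs": {"decay_steps": 1000, "decay_rate": 0.96,
#                    "staircase": True},
#         "min_learning_rate": 1e-5,
#         "start_decay_step": 500,
#         "end_decay_step": 10000,
#     })
#     decayed_lr = lr_decay_fn(
#         learning_rate=1e-3,
#         global_step=tf.train.get_or_create_global_step())
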
def get_gradient_clip_fn(hparams=None):
    """Creates a gradient clipping function based on the hyperparameters.

    See the :attr:`gradient_clip` field in
    :meth:`~texar.tf.core.default_optimization_hparams` for all
    hyperparameters and default values.

    The gradient clipping function takes a list of `(gradients, variables)`
    tuples and returns a list of `(clipped_gradients, variables)` tuples.
    Typical examples include
    :tf_main:`tf.clip_by_global_norm <clip_by_global_norm>`,
    :tf_main:`tf.clip_by_value <clip_by_value>`,
    :tf_main:`tf.clip_by_norm <clip_by_norm>`,
    :tf_main:`tf.clip_by_average_norm <clip_by_average_norm>`, etc.

    Args:
        hparams (dict or HParams, optional): hyperparameters. Missing
            hyperparameters are set to default values automatically.

    Returns:
        function or `None`: If hparams["type"] is specified, returns
        the respective function. If hparams["type"] is empty, returns
        `None`.
    """
    if hparams is None or isinstance(hparams, dict):
        hparams = HParams(
            hparams, default_optimization_hparams()["gradient_clip"])

    fn_type = hparams["type"]
    if fn_type is None or fn_type == "":
        return None

    fn_modules = ["tensorflow", "texar.tf.custom"]
    clip_fn = utils.get_function(fn_type, fn_modules)
    clip_fn_args = utils.get_args(clip_fn)
    fn_kwargs = hparams["kwargs"]
    if isinstance(fn_kwargs, HParams):
        fn_kwargs = fn_kwargs.todict()

    def grad_clip_fn(grads_and_vars):
        """Gradient clipping function.

        Args:
            grads_and_vars (list): A list of `(gradients, variables)` tuples.

        Returns:
            list: A list of `(clipped_gradients, variables)` tuples.
        """
        grads, vars_ = zip(*grads_and_vars)
        if clip_fn == tf.clip_by_global_norm:
            clipped_grads, _ = clip_fn(t_list=grads, **fn_kwargs)
        elif 't_list' in clip_fn_args:
            clipped_grads = clip_fn(t_list=grads, **fn_kwargs)
        elif 't' in clip_fn_args:   # e.g., tf.clip_by_value
            clipped_grads = [clip_fn(t=grad, **fn_kwargs) for grad in grads]

        return list(zip(clipped_grads, vars_))

    return grad_clip_fn
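
# Usage sketch (illustrative, not part of the original module): global-norm
# clipping applied to an optimizer's gradients. The clip norm is an arbitrary
# example value.
#
#     grad_clip_fn = get_gradient_clip_fn({
#         "type": "clip_by_global_norm",
#         "kwargs": {"clip_norm": 5.},
#     })
#     grads_and_vars = optimizer.compute_gradients(loss)
#     clipped = grad_clip_fn(grads_and_vars)
#     train_op = optimizer.apply_gradients(clipped)
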
def _get_static_lr(learning_rate=None, optimizer_class=None, hparams=None):
    """Return the base static learning_rate.

    A helper function for creating the optimization function.
    """
    hparams = HParams(hparams, default_optimization_hparams())
    opt_hparams = hparams['optimizer']
    if learning_rate is None:
        learning_rate = opt_hparams["kwargs"].get("learning_rate", None)
    if learning_rate is None:
        # Try to get learning_rate from the default value of the
        # optimizer's argument
        opt_argspec = utils.get_default_arg_values(optimizer_class.__init__)
        learning_rate = opt_argspec.get("learning_rate", None)
    return learning_rate
def get_optimizer(learning_rate=None, global_step=None, hparams=None):
    """Creates an optimizer instance.

    Args:
        learning_rate (float or Tensor, optional): If `None`, the learning
            rate specified in :attr:`hparams`, or the default learning rate
            of the optimizer (if it exists), is used.
        global_step (optional): A scalar int Tensor. Step counter to update
            on each step unless :attr:`increment_global_step` is `False`.
            Learning rate decay uses :attr:`global_step`. If `None`, it will
            be fetched from the default graph (see
            :tf_main:`tf.train.get_global_step <train/get_global_step>` for
            more details). If it has not been created, no step will be
            incremented with each weight update.
        hparams (dict or HParams, optional): hyperparameters. Missing
            hyperparameters are set to default values automatically. See
            :func:`~texar.tf.core.default_optimization_hparams` for all
            hyperparameters and default values.

    Returns:
        optimizer: the tf.train.Optimizer instance specified in hparams.
    """
    hparams = HParams(hparams, default_optimization_hparams())

    opt_hparams = hparams["optimizer"]
    optimizer_fn, optimizer_class = get_optimizer_fn(opt_hparams)

    static_lr = _get_static_lr(learning_rate, optimizer_class, hparams)

    lr_decay_fn = get_learning_rate_decay_fn(hparams["learning_rate_decay"])
    if lr_decay_fn is not None:
        learning_rate = lr_decay_fn(learning_rate=static_lr,
                                    global_step=global_step)
    else:
        learning_rate = static_lr

    tf.summary.scalar("learning_rate", learning_rate)

    optimizer = optimizer_fn(learning_rate=learning_rate)

    return optimizer
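
# Usage sketch (illustrative, not part of the original module): create an
# optimizer from hparams with a decayed learning rate tied to the global
# step. Values are arbitrary examples.
#
#     global_step = tf.train.get_or_create_global_step()
#     optimizer = get_optimizer(
#         global_step=global_step,
#         hparams={
#             "optimizer": {"type": "AdamOptimizer",
#                           "kwargs": {"learning_rate": 1e-3}},
#             "learning_rate_decay": {"type": "exponential_decay",
#                                     "kwargs": {"decay_steps": 1000,
#                                                "decay_rate": 0.9}},
#         })
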
def get_train_op(loss, variables=None, optimizer=None, learning_rate=None,
                 global_step=None, increment_global_step=True, hparams=None):
    """Creates a training op.

    This is a wrapper of :tf_main:`tf.contrib.layers.optimize_loss
    <contrib/layers/optimize_loss>`.

    Args:
        loss: A scalar Tensor representing the loss to minimize.
        variables (optional): A list of Variables to optimize. If `None`,
            all trainable variables are used.
        optimizer (optional): A tf.train.Optimizer instance. If `None`, the
            optimizer is created based on the settings in `hparams`.
        learning_rate (float or Tensor, optional): If `None`, the learning
            rate specified in :attr:`hparams`, or the default learning rate
            of the optimizer (if it exists), is used.
        global_step (optional): A scalar int Tensor. Step counter to update
            on each step unless :attr:`increment_global_step` is `False`.
            Learning rate decay uses :attr:`global_step`. If `None`, it will
            be fetched from the default graph (see
            :tf_main:`tf.train.get_global_step <train/get_global_step>` for
            more details). If it has not been created, no step will be
            incremented with each weight update.
        increment_global_step (bool): Whether to increment
            :attr:`global_step`. This is useful if the :attr:`global_step`
            is used in multiple training ops per training step (e.g. to
            optimize different parts of the model) to avoid incrementing
            :attr:`global_step` more times than necessary.
        hparams (dict or HParams, optional): hyperparameters. Missing
            hyperparameters are set to default values automatically. See
            :func:`~texar.tf.core.default_optimization_hparams` for all
            hyperparameters and default values.

    Returns:
        train_op: the operator used for variables optimization.
    """
    hparams = HParams(hparams, default_optimization_hparams())

    grad_clip_fn = get_gradient_clip_fn(hparams["gradient_clip"])

    if not isinstance(optimizer, tf.train.Optimizer):
        opt_hparams = hparams["optimizer"]
        optimizer_fn, optimizer_class = get_optimizer_fn(opt_hparams)
        learning_rate = _get_static_lr(
            learning_rate, optimizer_class, hparams)
        lr_decay_fn = get_learning_rate_decay_fn(
            hparams["learning_rate_decay"])
        train_op = tf.contrib.layers.optimize_loss(
            loss=loss,
            global_step=global_step,
            learning_rate=learning_rate,
            optimizer=optimizer_fn,
            gradient_noise_scale=hparams["gradient_noise_scale"],
            clip_gradients=grad_clip_fn,
            learning_rate_decay_fn=lr_decay_fn,
            variables=variables,
            name=hparams["name"],
            increment_global_step=increment_global_step)
    else:
        train_op = tf.contrib.layers.optimize_loss(
            loss=loss,
            global_step=global_step,
            learning_rate=None,
            optimizer=optimizer,
            gradient_noise_scale=hparams["gradient_noise_scale"],
            clip_gradients=grad_clip_fn,
            variables=variables,
            name=hparams["name"],
            increment_global_step=increment_global_step)

    return train_op
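
# Usage sketch (illustrative, not part of the original module): the typical
# entry point. `loss` is any scalar Tensor; the hparams shown are arbitrary
# example values.
#
#     loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
#         labels=labels, logits=logits))
#     train_op = get_train_op(
#         loss,
#         hparams={"optimizer": {"type": "AdamOptimizer",
#                                "kwargs": {"learning_rate": 1e-3}},
#                  "gradient_clip": {"type": "clip_by_global_norm",
#                                    "kwargs": {"clip_norm": 5.}}})
#     with tf.Session() as sess:
#         sess.run(tf.global_variables_initializer())
#         sess.run(train_op)
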
class AdamWeightDecayOptimizer(tf.train.Optimizer):
    """A basic Adam optimizer that includes "correct" L2 weight decay.

    Copied from the Google BERT repo, except that in the `apply_gradients`
    function we add support for incrementing the passed global step
    parameter, to make it more compatible with the tf.train.Optimizer
    implementation.
    """

    def __init__(self,
                 learning_rate,
                 weight_decay_rate=0.0,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-6,
                 exclude_from_weight_decay=None,
                 name="AdamWeightDecayOptimizer"):
        """Constructs an AdamWeightDecayOptimizer."""
        super(AdamWeightDecayOptimizer, self).__init__(False, name)

        self.learning_rate = learning_rate
        self.weight_decay_rate = weight_decay_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.exclude_from_weight_decay = exclude_from_weight_decay

    # pylint: disable=too-many-locals
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """See base class."""
        with tf.name_scope(name, self._name) as name:
            assignments = []
            for (grad, param) in grads_and_vars:
                if grad is None or param is None:
                    continue

                param_name = self._get_variable_name(param.name)

                m = tf.get_variable(
                    name=param_name + "/adam_m",
                    shape=param.shape.as_list(),
                    dtype=tf.float32,
                    trainable=False,
                    initializer=tf.zeros_initializer())
                v = tf.get_variable(
                    name=param_name + "/adam_v",
                    shape=param.shape.as_list(),
                    dtype=tf.float32,
                    trainable=False,
                    initializer=tf.zeros_initializer())

                # Standard Adam update.
                next_m = (tf.multiply(self.beta_1, m) +
                          tf.multiply(1.0 - self.beta_1, grad))
                next_v = (tf.multiply(self.beta_2, v) +
                          tf.multiply(1.0 - self.beta_2, tf.square(grad)))
                update = next_m / (tf.sqrt(next_v) + self.epsilon)

                # Just adding the square of the weights to the loss function
                # is *not* the correct way of using L2 regularization/weight
                # decay with Adam, since that will interact with the m and v
                # parameters in strange ways.
                #
                # Instead we want to decay the weights in a manner that
                # doesn't interact with the m/v parameters. This is
                # equivalent to adding the square of the weights to the loss
                # with plain (non-momentum) SGD.
                if self._do_use_weight_decay(param_name):
                    update += self.weight_decay_rate * param

                update_with_lr = self.learning_rate * update

                next_param = param - update_with_lr

                assignments.extend(
                    [param.assign(next_param),
                     m.assign(next_m),
                     v.assign(next_v)])

            update_ops = assignments
            if global_step is None:
                apply_updates = self._finish(update_ops, name)
            else:
                with tf.control_dependencies(
                        [self._finish(update_ops, "update")]):
                    with tf.colocate_with(global_step):
                        apply_updates = tf.assign_add(
                            global_step, 1, name=name)
            return apply_updates

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if not self.weight_decay_rate:
            return False
        if self.exclude_from_weight_decay:
            for r in self.exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True

    def _get_variable_name(self, param_name):
        """Get the variable name from the tensor name."""
        m = re.match("^(.*):\\d+$", param_name)
        if m is not None:
            param_name = m.group(1)
        return param_name
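
# Usage sketch (illustrative, not part of the original module): decoupled
# weight decay applied to all parameters except biases and LayerNorm weights,
# using exclusion patterns in the style of the BERT reference code. Values
# are arbitrary examples.
#
#     optimizer = AdamWeightDecayOptimizer(
#         learning_rate=2e-5,
#         weight_decay_rate=0.01,
#         exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
#     train_op = get_train_op(
#         loss, optimizer=optimizer,
#         hparams={"gradient_clip": {"type": "clip_by_global_norm",
#                                    "kwargs": {"clip_norm": 1.0}}})
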