Source code for texar.tf.modules.policies.policy_nets

# Copyright 2018 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Policy models based on feed forward networks.
"""

import numpy as np

import tensorflow as tf
from tensorflow_probability import distributions as tfpd

from texar.tf.module_base import ModuleBase
from texar.tf.agents.agent_utils import Space
from texar.tf.utils import utils
from texar.tf.utils.dtypes import get_tf_dtype

# pylint: disable=no-member

# Public names exported by this module (used by `from ... import *`).
__all__ = [
    'PolicyNetBase',
    'CategoricalPolicyNet'
]


class PolicyNetBase(ModuleBase):
    """Policy net that takes in states and outputs actions.

    Args:
        network (optional): A network that takes in state and returns
            outputs for generating actions. For example, an instance of subclass
            of :class:`~texar.tf.modules.FeedForwardNetworkBase`. If `None`,
            a network is created as specified in :attr:`hparams`.
        network_kwargs (dict, optional): Keyword arguments for network
            constructor.
            Note that the `hparams` argument for network
            constructor is specified in the "network_hparams" field of
            :attr:`hparams` and should not be included in `network_kwargs`.
            Ignored if :attr:`network` is given.
        hparams (dict or HParams, optional): Hyperparameters. Missing
            hyperparameters will be set to default values. See
            :meth:`default_hparams` for the hyperparameter structure and
            default values.
    """

    def __init__(self,
                 network=None,
                 network_kwargs=None,
                 hparams=None):
        ModuleBase.__init__(self, hparams=hparams)

        # Adopt or construct the network inside this module's variable scope.
        with tf.variable_scope(self.variable_scope):
            self._build_network(network, network_kwargs)

    @staticmethod
    def default_hparams():
        """Returns a dictionary of hyperparameters with default values.

        .. role:: python(code)
           :language: python

        .. code-block:: python

            {
                'network_type': 'FeedForwardNetwork',
                'network_hparams': {
                    'layers': [
                        {
                            'type': 'Dense',
                            'kwargs': {'units': 256, 'activation': 'relu'}
                        },
                        {
                            'type': 'Dense',
                            'kwargs': {'units': 256, 'activation': 'relu'}
                        },
                    ]
                },
                'distribution_kwargs': None,
                'name': 'policy_net',
            }

        Here:

        "network_type": str or class or instance
            A network that takes in state and returns outputs for
            generating actions. This can be a class, its name or module path,
            or a class instance. Ignored if `network` is given to the
            constructor.

        "network_hparams": dict
            Hyperparameters for the network. With the :attr:`network_kwargs`
            argument to the constructor, a network is created with
            :python:`network_class(**network_kwargs, hparams=network_hparams)`.

            For example, the default values create a two-layer dense network.

        "distribution_kwargs": dict, optional
            Keyword arguments for distribution constructor. A distribution
            would be created for action sampling.

        "name": str
            Name of the policy.
        """
        return {
            'network_type': 'FeedForwardNetwork',
            'network_hparams': {
                'layers': [
                    {
                        'type': 'Dense',
                        'kwargs': {'units': 256, 'activation': 'relu'}
                    },
                    {
                        'type': 'Dense',
                        'kwargs': {'units': 256, 'activation': 'relu'}
                    },
                ]
            },
            'distribution_kwargs': None,
            'name': 'policy_net',
            '@no_typecheck': ['network_type', 'network_hparams']
        }

    def _build_network(self, network, kwargs):
        """Sets :attr:`_network`, either from `network` or from hparams.

        Args:
            network: A ready-made network, or `None` to create one from
                the "network_type" and "network_hparams" hyperparameters.
            kwargs (dict, optional): Keyword arguments for the network
                constructor. Only used when `network` is `None`.
        """
        if network is not None:
            # A user-supplied network takes precedence over hparams.
            self._network = network
            return

        # Combine the caller's kwargs with the "network_hparams" field, then
        # resolve "network_type" (class, name/path, or instance) into an
        # instance, searching the listed module paths.
        kwargs = utils.get_instance_kwargs(
            kwargs, self._hparams.network_hparams)
        self._network = utils.check_or_get_instance(
            self._hparams.network_type,
            kwargs,
            module_paths=['texar.tf.modules', 'texar.tf.custom'])

    def _build(self, inputs, mode=None):  # pylint: disable=arguments-differ
        """Not implemented in the base class; always raises
        `NotImplementedError`.
        """
        raise NotImplementedError

    @property
    def network(self):
        """The network.
        """
        return self._network


# TODO(zhiting): Allow structured discrete actions.
class CategoricalPolicyNet(PolicyNetBase):
    """Policy net with Categorical distribution for discrete scalar actions.

    This is a combination of a network with a top-layer distribution for
    action sampling.

    Args:
        action_space (optional): An instance of :class:`~texar.tf.agents.Space`
            specifying the action space. If not given, a discrete action space
            `[0, high]` is created with `high` specified in :attr:`hparams`.
        network (optional): A network that takes in state and returns
            outputs for generating actions. For example, an instance of subclass
            of :class:`~texar.tf.modules.FeedForwardNetworkBase`. If `None`,
            a network is created as specified in :attr:`hparams`.
        network_kwargs (dict, optional): Keyword arguments for network
            constructor.
            Note that the `hparams` argument for network
            constructor is specified in the "network_hparams" field of
            :attr:`hparams` and should not be included in `network_kwargs`.
            Ignored if :attr:`network` is given.
        hparams (dict or HParams, optional): Hyperparameters. Missing
            hyperparameters will be set to default values. See
            :meth:`default_hparams` for the hyperparameter structure and
            default values.

    .. document private functions
    .. automethod:: _build
    """

    def __init__(self,
                 action_space=None,
                 network=None,
                 network_kwargs=None,
                 hparams=None):
        # Forward `network` and `network_kwargs` to the base class; without
        # this the documented arguments would be silently ignored and a
        # network would always be created from hparams.
        PolicyNetBase.__init__(self,
                               network=network,
                               network_kwargs=network_kwargs,
                               hparams=hparams)

        with tf.variable_scope(self.variable_scope):
            if action_space is None:
                # Fall back to a scalar integer space bounded by the
                # "action_space" hyperparameter.
                action_space = Space(
                    low=0, high=self._hparams.action_space, dtype=np.int32)
            self._action_space = action_space
            self._append_output_layer()

    @staticmethod
    def default_hparams():
        """Returns a dictionary of hyperparameters with default values.

        .. code-block:: python

            {
                'network_type': 'FeedForwardNetwork',
                'network_hparams': {
                    'layers': [
                        {
                            'type': 'Dense',
                            'kwargs': {'units': 256, 'activation': 'relu'}
                        },
                        {
                            'type': 'Dense',
                            'kwargs': {'units': 256, 'activation': 'relu'}
                        },
                    ]
                },
                'distribution_kwargs': {
                    'dtype': 'int32',
                    'validate_args': False,
                    'allow_nan_stats': True
                },
                'action_space': 2,
                'make_output_layer': True,
                'name': 'categorical_policy_net'
            }

        Here:

        "distribution_kwargs": dict
            Keyword arguments for the :tf_main:`Categorical
            <distributions/Categorical>` distribution constructor. Arguments
            `logits` and `probs` should not be included as they are inferred
            from the inputs. Argument `dtype` can be a string (e.g., `int32`)
            and will be converted to a corresponding tf dtype.

        "action_space": int
            Upper bound of the action space. The resulting action space is
            all discrete scalar numbers between 0 and the upper bound specified
            here (both inclusive).

        "make_output_layer": bool
            Whether to append a dense layer to the network to transform
            features to logits for action sampling. If `False`, the final layer
            output of network must match the action space.

        See :meth:`~texar.tf.modules.PolicyNetBase.default_hparams` for details
        of other hyperparameters.
        """
        hparams = PolicyNetBase.default_hparams()
        hparams.update({
            'distribution_kwargs': {
                'dtype': 'int32',
                'validate_args': False,
                'allow_nan_stats': True
            },
            'action_space': 2,
            'make_output_layer': True,
            'name': 'categorical_policy_net'
        })
        return hparams

    def _append_output_layer(self):
        """Appends a dense logits layer to the network, if enabled.

        Does nothing when the "make_output_layer" hyperparameter is false.

        Raises:
            ValueError: If the action space is not scalar (its shape is
                not `()`).
        """
        if not self._hparams.make_output_layer:
            return

        if self._action_space.shape != ():
            raise ValueError('Only scalar discrete action is supported.')

        # NOTE(review): the "action_space" hparam docs describe both bounds
        # as inclusive, which would imply `high - low + 1` logits; the code
        # uses `high - low`. Confirm against how `Space.high` is set by
        # callers before changing this.
        output_size = self._action_space.high - self._action_space.low

        self._network.append_layer({
            'type': 'Dense',
            'kwargs': {'units': output_size}
        })

    def _build(self, inputs, mode=None):
        """Takes in states and outputs actions.

        Args:
            inputs: Inputs to the policy network with the first dimension
                the batch dimension.
            mode (optional): A tensor taking value in
                :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`, including
                `TRAIN`, `EVAL`, and `PREDICT`. If `None`,
                :func:`texar.tf.global_mode` is used.

        Returns:
            A `dict` including fields `"logits"`, `"action"`, and `"dist"`,
            where

            - **"logits"**: A Tensor of shape
              `[batch_size] + action_space size` used for categorical
              distribution sampling.
            - **"action"**: A Tensor of shape
              `[batch_size] + action_space.shape`.
            - **"dist"**: The
              :tf_main:`Categorical <distributions/Categorical>` based on the
              logits.
        """
        logits = self._network(inputs, mode=mode)

        # The dtype may be given as a string in hparams; convert it to a
        # tf dtype before handing the kwargs to the distribution.
        dkwargs = self._hparams.distribution_kwargs.todict()
        dkwargs['dtype'] = get_tf_dtype(dkwargs['dtype'])
        dist = tfpd.Categorical(logits=logits, **dkwargs)

        # Reshape the sample to `[batch] + action_space.shape`; -1 stands
        # for the batch dimension.
        to_shape = [-1] + list(self._action_space.shape)
        action = tf.reshape(dist.sample(), to_shape)

        outputs = {
            "logits": logits,
            "action": action,
            "dist": dist
        }

        # On the first call, register trainable variables (including the
        # network's) and mark the module as built.
        if not self._built:
            self._add_internal_trainable_variables()
            self._add_trainable_variable(self._network.trainable_variables)
            self._built = True

        return outputs

    @property
    def action_space(self):
        """An instance of :class:`~texar.tf.agents.Space` specifying the
        action space.
        """
        return self._action_space