Source code for texar.tf.data.data.data_base

# Copyright 2018 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Base data class that is inherited by all data classes.
A data class defines data reading, parsing, batching, and other
preprocessing operations.
"""

import tensorflow as tf

from texar.tf.hyperparams import HParams
from texar.tf.data.data import dataset_utils as dsutils
from texar.tf.data.data_utils import count_file_lines

__all__ = [
    "DataBase"
]


class DataBase(object):
    """Base class inherited by all data classes.
    """

    def __init__(self, hparams):
        self._hparams = HParams(hparams, self.default_hparams())

    @staticmethod
    def default_hparams():
        """Returns a dictionary of default hyperparameters.

        .. code-block:: python

            {
                "num_epochs": 1,
                "batch_size": 64,
                "allow_smaller_final_batch": True,
                "shuffle": True,
                "shuffle_buffer_size": None,
                "shard_and_shuffle": False,
                "num_parallel_calls": 1,
                "prefetch_buffer_size": 0,
                "max_dataset_size": -1,
                "seed": None,
                "name": "data",
            }

        Here:

        "num_epochs": int
            Number of times the dataset should be repeated. An
            :tf_main:`OutOfRangeError <errors/OutOfRangeError>` signal will
            be raised after the whole repeated dataset has been iterated
            through. E.g., for training data, set it to 1 (default) so that
            you will get the signal after each epoch of training. Set to -1
            to repeat the dataset indefinitely.

        "batch_size": int
            Batch size, i.e., the number of consecutive elements of the
            dataset to combine in a single batch.

        "allow_smaller_final_batch": bool
            Whether to allow the final batch to be smaller if there are
            insufficient elements left. If `False`, the final batch is
            discarded if it is smaller than the batch size. Note that, if
            `False`, `output_shapes` of the resulting dataset will have a
            **static** batch_size dimension equal to "batch_size".

        "shuffle": bool
            Whether to randomly shuffle the elements of the dataset.

        "shuffle_buffer_size": int
            The buffer size for data shuffling. The larger, the better
            the resulting data is mixed.

            If `None` (default), the buffer size is set to the size of the
            whole dataset (i.e., to make the shuffling maximally effective).

        "shard_and_shuffle": bool
            Whether to first shard the dataset and then shuffle each
            block respectively. Useful when the whole data is too large
            to be loaded efficiently into memory.

            If `True`, :attr:`shuffle_buffer_size` must be specified to
            determine the size of each shard.

        "num_parallel_calls": int
            Number of elements from the dataset to process in parallel.

        "prefetch_buffer_size": int
            The maximum number of elements that will be buffered when
            prefetching.

        "max_dataset_size": int
            Maximum number of instances to include in the dataset. If set
            to `-1` or greater than the size of the dataset, all instances
            will be included. This constraint is imposed after data
            shuffling and filtering.

        "seed": int, optional
            The random seed for shuffling.

            Note that if a seed is set, the shuffle order will be exactly
            the same every time when going through the (repeated) dataset.

            For example, consider a dataset with elements [1, 2, 3], with
            "num_epochs"`=2` and some fixed seed, the resulting sequence
            can be: 2 1 3, 1 3 2 | 2 1 3, 1 3 2, ... That is, the orders
            are different **within** every `num_epochs`, but are the same
            **across** the `num_epochs`.

        "name": str
            Name of the data.
        """
        return {
            "name": "data",
            "num_epochs": 1,
            "batch_size": 64,
            "allow_smaller_final_batch": True,
            "shuffle": True,
            "shuffle_buffer_size": None,
            "shard_and_shuffle": False,
            "num_parallel_calls": 1,
            "prefetch_buffer_size": 0,
            "max_dataset_size": -1,
            "seed": None
        }

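    # Usage sketch (illustrative; not part of the original module): a data
    # class merges user-provided hyperparameters with these defaults via
    # ``HParams``, so unspecified fields fall back to the defaults above.
    # Instantiating the base class directly, just for illustration:
    #
    #     data = DataBase({"batch_size": 32, "shuffle": False})
    #     data.batch_size            # -> 32
    #     data.hparams.num_epochs    # -> 1 (default)
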
    @staticmethod
    def _make_batch(dataset, hparams, padded_batch=False, padding_values=None):
        """Batches the dataset, repeating it for `num_epochs` first and
        dropping the (padded) remainder batch if smaller final batches are
        not allowed.
        """
        dataset = dataset.repeat(hparams.num_epochs)
        batch_size = hparams["batch_size"]
        if hparams["allow_smaller_final_batch"]:
            if padded_batch:
                dataset = dataset.padded_batch(
                    batch_size, dataset.output_shapes,
                    padding_values=padding_values)
            else:
                dataset = dataset.batch(batch_size)
        else:
            dataset = dataset.apply(
                tf.contrib.data.padded_batch_and_drop_remainder(
                    batch_size, dataset.output_shapes,
                    padding_values=padding_values))
        return dataset

    @staticmethod
    def _shuffle_dataset(dataset, hparams, dataset_files):
        """Shuffles the dataset according to the hyperparameters, sharding
        it first if "shard_and_shuffle" is enabled. Returns the resulting
        dataset and its size (`None` if the size was not counted).
        """
        dataset_size = None
        shuffle_buffer_size = hparams["shuffle_buffer_size"]
        if hparams["shard_and_shuffle"]:
            if shuffle_buffer_size is None:
                raise ValueError(
                    "Dataset hyperparameter 'shuffle_buffer_size' "
                    "must not be `None` if 'shard_and_shuffle'=`True`.")
            dataset_size = count_file_lines(dataset_files)
            if shuffle_buffer_size >= dataset_size:
                raise ValueError(
                    "Dataset size (%d) <= shuffle_buffer_size (%d). Set "
                    "'shard_and_shuffle' to `False`." %
                    (dataset_size, shuffle_buffer_size))
            # TODO(zhiting): Use a different seed?
            dataset = dataset.apply(dsutils.random_shard_dataset(
                dataset_size, shuffle_buffer_size, hparams["seed"]))
            dataset = dataset.shuffle(shuffle_buffer_size + 16,  # add a margin
                                      seed=hparams["seed"])
        elif hparams["shuffle"]:
            if shuffle_buffer_size is None:
                dataset_size = count_file_lines(dataset_files)
                shuffle_buffer_size = dataset_size
            dataset = dataset.shuffle(shuffle_buffer_size,
                                      seed=hparams["seed"])
        return dataset, dataset_size

    @property
    def num_epochs(self):
        """Number of epochs.
        """
        return self._hparams.num_epochs

    @property
    def batch_size(self):
        """The batch size.
        """
        return self._hparams.batch_size

    @property
    def hparams(self):
        """A :class:`~texar.tf.HParams` instance of the data hyperparameters.
        """
        return self._hparams

    @property
    def name(self):
        """Name of the data.
        """
        return self._hparams.name
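
# Usage sketch (illustrative; not part of the original module): how the
# helpers above compose into an input pipeline. The file name below is a
# placeholder assumption; concrete subclasses wire these steps together when
# building their datasets. Setting "shuffle_buffer_size" explicitly avoids
# the eager line count that `_shuffle_dataset` performs when it is `None`:
#
#     files = ["train.txt"]  # placeholder path
#     hparams = HParams({"shuffle": True, "shuffle_buffer_size": 10000},
#                       DataBase.default_hparams())
#     dataset = tf.data.TextLineDataset(files)
#     dataset, _ = DataBase._shuffle_dataset(dataset, hparams, files)
#     dataset = DataBase._make_batch(dataset, hparams)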