Source code for texar.tf.data.data.scalar_data

# Copyright 2018 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Various data classes that define data reading, parsing, batching, and other
preprocessing operations.
"""

import tensorflow as tf

from texar.tf.data.data_utils import count_file_lines
from texar.tf.data.data import dataset_utils as dsutils
from texar.tf.data.data.data_base import DataBase
from texar.tf.data.data.mono_text_data import MonoTextData
from texar.tf.data.data_decoders import ScalarDataDecoder

# pylint: disable=invalid-name, arguments-differ, not-context-manager

__all__ = [
    "_default_scalar_dataset_hparams",
    "ScalarData"
]


def _default_scalar_dataset_hparams():
    """Returns hyperparameters of a scalar dataset with default values.

    See :meth:`texar.tf.data.ScalarData.default_hparams` for details.
    """
    return {
        "files": [],
        "compression_type": None,
        "data_type": "int",
        "data_name": None,
        "other_transformations": [],
        "@no_typecheck": ["files"]
    }


class ScalarData(DataBase):
    """Scalar data where each line of the files is a scalar (int or float),
    e.g., a data label.

    Args:
        hparams (dict): Hyperparameters. See :meth:`default_hparams` for the
            defaults.

    The processor reads and processes raw data and results in a TF dataset
    whose element is a python `dict` including one field. The field name is
    specified in :attr:`hparams["dataset"]["data_name"]`. If not specified,
    the default name is `"data"`. The field name can be accessed through
    :attr:`data_name`.

    This field is a Tensor of shape `[batch_size]` containing a batch of
    scalars, of either int or float type as specified in :attr:`hparams`.

    Example:

        .. code-block:: python

            hparams={
                'dataset': { 'files': 'data.txt', 'data_name': 'label' },
                'batch_size': 2
            }
            data = ScalarData(hparams)
            iterator = DataIterator(data)
            batch = iterator.get_next()

            iterator.switch_to_dataset(sess) # initializes the dataset
            batch_ = sess.run(batch)
            # batch_ == {
            #     'label': [2, 9]
            # }
    """

    def __init__(self, hparams):
        DataBase.__init__(self, hparams)
        with tf.name_scope(self.name, self.default_hparams()["name"]):
            self._make_data()

    @staticmethod
    def default_hparams():
        """Returns a dictionary of default hyperparameters.

        .. code-block:: python

            {
                # (1) Hyperparams specific to scalar dataset
                "dataset": {
                    "files": [],
                    "compression_type": None,
                    "data_type": "int",
                    "other_transformations": [],
                    "data_name": None,
                },
                # (2) General hyperparams
                "num_epochs": 1,
                "batch_size": 64,
                "allow_smaller_final_batch": True,
                "shuffle": True,
                "shuffle_buffer_size": None,
                "shard_and_shuffle": False,
                "num_parallel_calls": 1,
                "prefetch_buffer_size": 0,
                "max_dataset_size": -1,
                "seed": None,
                "name": "scalar_data",
            }

        Here:

        1. For the hyperparameters in the :attr:`"dataset"` field:

            "files": str or list
                A (list of) file path(s). Each line contains a single scalar
                number.

            "compression_type": str, optional
                One of "" (no compression), "ZLIB", or "GZIP".

            "data_type": str
                The scalar type. Currently supports "int" and "float".

            "other_transformations": list
                A list of transformation functions or function names/paths to
                further transform each single data instance.

                (More documentation to be added.)

            "data_name": str
                Name of the dataset.

        2. For the **general** hyperparameters, see
           :meth:`texar.tf.data.DataBase.default_hparams` for details.
        """
        hparams = DataBase.default_hparams()
        hparams["name"] = "scalar_data"
        hparams.update({
            "dataset": _default_scalar_dataset_hparams()
        })
        return hparams

    @staticmethod
    def _get_dtype(dtype_hparam):
        if dtype_hparam == "int":
            dtype = tf.int32
        elif dtype_hparam == "float":
            dtype = tf.float32
        else:
            raise ValueError("Unknown data type: " + dtype_hparam)
        return dtype

    @staticmethod
    def _make_processor(dataset_hparams, data_spec, chained=True,
                        name_prefix=None):
        # Create data decoder
        decoder = ScalarDataDecoder(
            ScalarData._get_dtype(dataset_hparams["data_type"]),
            data_name=name_prefix)

        # Create other transformations
        data_spec.add_spec(decoder=decoder)
        # pylint: disable=protected-access
        other_trans = MonoTextData._make_other_transformations(
            dataset_hparams["other_transformations"], data_spec)

        data_spec.add_spec(name_prefix=name_prefix)

        if chained:
            chained_tran = dsutils.make_chained_transformation(
                [decoder] + other_trans)
            return chained_tran, data_spec
        else:
            return decoder, other_trans, data_spec

    def _process_dataset(self, dataset, hparams, data_spec):
        chained_tran, data_spec = self._make_processor(
            hparams["dataset"], data_spec,
            name_prefix=hparams["dataset"]["data_name"])
        num_parallel_calls = hparams["num_parallel_calls"]
        dataset = dataset.map(
            lambda *args: chained_tran(dsutils.maybe_tuple(args)),
            num_parallel_calls=num_parallel_calls)

        # Truncates data count
        dataset = dataset.take(hparams["max_dataset_size"])

        return dataset, data_spec

    def _make_data(self):
        dataset_hparams = self._hparams.dataset

        # Create and shuffle dataset
        dataset = MonoTextData._make_mono_text_dataset(dataset_hparams)
        dataset, dataset_size = self._shuffle_dataset(
            dataset, self._hparams, self._hparams.dataset.files)
        self._dataset_size = dataset_size

        # Processing
        # pylint: disable=protected-access
        data_spec = dsutils._DataSpec(dataset=dataset,
                                      dataset_size=self._dataset_size)
        dataset, data_spec = self._process_dataset(dataset, self._hparams,
                                                   data_spec)
        self._data_spec = data_spec
        self._decoder = data_spec.decoder  # pylint: disable=no-member

        # Batching
        dataset = self._make_batch(dataset, self._hparams)

        # Prefetching
        if self._hparams.prefetch_buffer_size > 0:
            dataset = dataset.prefetch(self._hparams.prefetch_buffer_size)

        self._dataset = dataset

    def list_items(self):
        """Returns the list of item names that the data can produce.

        Returns:
            A list of strings.
        """
        return list(self._dataset.output_types.keys())

    @property
    def dataset(self):
        """The dataset.
        """
        return self._dataset

    def dataset_size(self):
        """Returns the number of data instances in the dataset.

        Note that this is the total data count in the raw files, before any
        filtering and truncation.
        """
        if not self._dataset_size:
            # pylint: disable=attribute-defined-outside-init
            self._dataset_size = count_file_lines(
                self._hparams.dataset.files)
        return self._dataset_size

    @property
    def data_name(self):
        """The name of the data tensor, "data" by default if not specified in
        :attr:`hparams`.
        """
        return self._decoder.data_tensor_name
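

# The block below is an illustrative sketch (not part of the original module)
# of how the ``other_transformations`` hyperparameter can be used, since its
# documentation above is still marked as incomplete. The file ``labels.txt``
# and the function ``clip_label`` are hypothetical, and the two-argument
# signature ``(data, data_spec)`` is an assumption about how each
# transformation is applied to a single, pre-batching data instance:
#
#     def clip_label(data, data_spec):
#         # ``data`` is the dict produced by the decoder, e.g. {'label': 3}.
#         data["label"] = tf.minimum(data["label"], 9)
#         return data
#
#     hparams = {
#         "dataset": {
#             "files": "labels.txt",
#             "data_name": "label",
#             "data_type": "int",
#             "other_transformations": [clip_label],
#         },
#         "batch_size": 2,
#     }
#     data = ScalarData(hparams)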