# Source code for texar.tf.utils.utils_io

# -*- coding: utf-8 -*-
# Copyright 2018 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utility functions related to input/output.
"""

# pylint: disable=invalid-name, redefined-builtin, too-many-arguments

from io import open
import os
import importlib
import yaml

import tensorflow as tf

# Module-level shorthand for `tf.compat.as_text`; `write_paired_text` below
# uses it to convert values to text before writing them.
as_text = tf.compat.as_text

# Public names exported by this module.
__all__ = [
    "load_config_single",
    "load_config",
    "write_paired_text",
    "maybe_create_dir",
    "get_files"
]

# def get_tf_logger(fname,
#                  verbosity=tf.logging.INFO,
#                  to_stdio=False,
#                  stdio_verbosity=None):
#    """Creates TF logger that allows to specify log filename and whether to
#    print to stdio at the same time.
#
#    Args:
#        fname (str): The log filename.
#        verbosity: The threshold for what messages will be logged. Default is
#           `INFO`. Other options include `DEBUG`, `ERROR`, `FATAL`, and `WARN`.
#            See :tf_main:`tf.logging <logging>`.
#        to_stdio (bool): Whether to print messages to stdio at the same time.
#        stdio_verbosity (optional): The verbosity level when printing to stdio.
#            If `None` (default), the level is set to be the same as
#            :attr:`verbosity`. Ignored if :attr:`to_stdio` is False.
#
#    Returns:
#        The TF logger.
#    """


def _load_config_python(fname):
    """Loads configurations from a Python module.

    Args:
        fname (str): Name of the config module, optionally carrying a
            trailing '.py' suffix (e.g., `'my_config.py'`). Once the suffix
            is removed, the name is passed unchanged to
            `importlib.import_module`.

    Returns:
        A `dict` mapping each attribute name of the imported module that is
        not of the dunder form (`__xxx__`) to its value.
    """
    # `str.rstrip('.py')` removes any trailing run of the characters '.', 'p'
    # and 'y' instead of the literal suffix (e.g., 'copy.py' -> 'co'), so the
    # suffix is sliced off explicitly.
    suffix = '.py'
    if fname.endswith(suffix):
        module_name = fname[:-len(suffix)]
    else:
        module_name = fname

    config_module = importlib.import_module(module_name)
    return {
        key: getattr(config_module, key)
        for key in dir(config_module)
        if not (key.startswith('__') and key.endswith('__'))
    }


def _load_config_yaml(fname):
    """Loads configurations from a YAML file.

    Args:
        fname (str): Path of the YAML file; it is opened with
            `tf.gfile.GFile`.

    Returns:
        The parsed content of the file, as returned by `yaml.load`.
    """
    # Calling `yaml.load` without a `Loader` is deprecated since PyYAML 5.1
    # and is a `TypeError` from PyYAML 6.0 on. `FullLoader` is what 5.1+ used
    # as the implicit default; older releases lack it and defaulted to
    # `Loader`, so fall back to that to keep the previous behavior.
    # NOTE(review): config files are assumed to be trusted here. If they can
    # come from an untrusted source, switch to `yaml.safe_load`.
    loader = getattr(yaml, 'FullLoader', yaml.Loader)
    with tf.gfile.GFile(fname) as config_file:
        config = yaml.load(config_file, Loader=loader)
    return config


def load_config_single(fname, config=None):
    """Loads config from a single file.

    The config file can be either a Python file (with suffix '.py')
    or a YAML file. If the filename is not suffixed with '.py', the file is
    parsed as YAML.

    Args:
        fname (str): The config file name.
        config (dict, optional): A config dict to which new configurations are
            added. If `None`, a new config dict is created. Otherwise the
            dict is updated in place: when both the existing and the new
            value of a key are dicts, the existing dict is updated with the
            new one (one level only); in every other case the new value
            replaces the existing one.

    Returns:
        A `dict` of configurations.
    """
    if fname.endswith('.py'):
        new_config = _load_config_python(fname)
    else:
        new_config = _load_config_yaml(fname)

    # PyYAML parses an empty document as `None`; treat that as "no settings"
    # so the caller always gets a dict back and the merge below cannot fail
    # on `None.items()`.
    if new_config is None:
        new_config = {}

    if config is None:
        return new_config

    for key, value in new_config.items():
        if isinstance(config.get(key), dict) and isinstance(value, dict):
            config[key].update(value)
        else:
            # Either the key is new, or at least one side is not a dict
            # (e.g., a sub-config overridden with a scalar or `None`):
            # the new value simply wins.
            config[key] = value

    return config


def load_config(config_path, config=None):
    """Loads configs from (possibly multiple) file(s).

    A config file can be either a Python file (with suffix '.py')
    or a YAML file. If the filename is not suffixed with '.py', the file is
    parsed as YAML.

    Args:
        config_path: Paths to configuration files. This can be a `list` (or
            `tuple`) of config file names, or a path to a directory in which
            all files are loaded, or a string of multiple file names
            separated by commas.
        config (dict, optional): A config dict to which new configurations are
            added. If `None`, a new config dict is created.

    Returns:
        A `dict` of configurations.
    """
    if isinstance(config_path, (list, tuple)):
        fnames = list(config_path)
    elif tf.gfile.IsDirectory(config_path):
        # Files loaded later override earlier ones on conflicting keys, so
        # the entries are sorted to make the merge order (and hence the
        # result) deterministic rather than dependent on listing order.
        fnames = []
        for entry in sorted(tf.gfile.ListDirectory(config_path)):
            path = os.path.join(config_path, entry)
            if not tf.gfile.IsDirectory(path):
                fnames.append(path)
    else:
        # Comma-separated names; surrounding whitespace and empty items
        # (e.g., from a trailing comma) are dropped.
        fnames = []
        for fname in config_path.split(","):
            fname = fname.strip()
            if fname:
                fnames.append(fname)

    if config is None:
        config = {}

    for fname in fnames:
        config = load_config_single(fname, config)

    return config
# pylint: disable=too-many-locals
def write_paired_text(src, tgt, fname, append=False, mode='h', sep='\t',
                      src_fname_suffix='src', tgt_fname_suffix='tgt'):
    """Writes paired text to a file.

    Args:
        src: A list (or array) of `str` source text.
        tgt: A list (or array) of `str` target text.
        fname (str): The output filename.
        append (bool): Whether append content to the end of the file if exists.
        mode (str): The mode of writing, with the following options:

            - **'h'**: The "horizontal" mode. Each source target pair is
              written in one line, intervened with :attr:`sep`, e.g.::

                  source_1 target_1
                  source_2 target_2

            - **'v'**: The "vertical" mode. Each source target pair is
              written in two consecutive lines, e.g::

                  source_1
                  target_1
                  source_2
                  target_2

            - **'s'**: The "separate" mode. Each source target pair is
              written in corresponding lines of two files named
              as `"{fname}.{src_fname_suffix}"`
              and `"{fname}.{tgt_fname_suffix}"`, respectively.

        sep (str): The string intervening between source and target. Used
            when :attr:`mode` is set to 'h'.
        src_fname_suffix (str): Used when :attr:`mode` is 's'. The suffix to
            the source output filename. E.g., with
            `(fname='output', src_fname_suffix='src')`, the output source file
            is named as `output.src`.
        tgt_fname_suffix (str): Used when :attr:`mode` is 's'. The suffix to
            the target output filename.

    Returns:
        The filename(s). If `mode` == 'h' or 'v', returns :attr:`fname`.
        If `mode` == 's', returns a tuple of filenames
        `("{fname}.{src_fname_suffix}", "{fname}.{tgt_fname_suffix}")`.

    Raises:
        ValueError: If :attr:`mode` is not one of 'h', 'v' and 's'. The check
            happens before any file is opened.
    """
    # Validate first: opening with 'w' truncates an existing file, so a bad
    # `mode` must be rejected before any output file is touched.
    if mode not in ('h', 'v', 's'):
        raise ValueError('Unknown mode: {}'.format(mode))

    fmode = 'a' if append else 'w'

    if mode == 's':
        fn_src = '{}.{}'.format(fname, src_fname_suffix)
        fn_tgt = '{}.{}'.format(fname, tgt_fname_suffix)
        for fn, lines in ((fn_src, src), (fn_tgt, tgt)):
            with open(fn, fmode, encoding='utf-8') as f:
                # Convert each item before joining so that `bytes` items are
                # accepted here exactly as in the 'h' and 'v' modes.
                f.write(as_text('\n'.join(as_text(line) for line in lines)))
                f.write('\n')
        return fn_src, fn_tgt

    with open(fname, fmode, encoding='utf-8') as f:
        for s, t in zip(src, tgt):
            if mode == 'h':
                text = '{}{}{}\n'.format(as_text(s), sep, as_text(t))
            else:  # mode == 'v'
                text = '{}\n{}\n'.format(as_text(s), as_text(t))
            f.write(as_text(text))
    return fname
def maybe_create_dir(dirname):
    """Creates the directory :attr:`dirname` if it does not exist yet.

    Args:
        dirname (str): Path of the directory.

    Returns:
        bool: `True` if the directory was created by this call, `False` if
        :attr:`dirname` already was a directory.
    """
    if tf.gfile.IsDirectory(dirname):
        return False
    tf.gfile.MakeDirs(dirname)
    return True
def get_files(file_paths):
    """Gets a list of file paths given possibly a pattern :attr:`file_paths`.

    Adapted from `tf.contrib.slim.data.parallel_reader.get_data_files`.

    Args:
        file_paths: A (list of) path to the files. The path can be a pattern,
            e.g., /path/to/train*, /path/to/train[12]. A path containing none
            of the characters `*`, `?` and `[` is returned as is, without
            checking that it exists.

    Returns:
        A list of file paths.

    Raises:
        ValueError: If no files are found.
    """
    if isinstance(file_paths, (list, tuple)):
        # Resolve every item recursively and concatenate the results.
        files = []
        for path in file_paths:
            files.extend(get_files(path))
    elif any(char in file_paths for char in '*?['):
        files = tf.gfile.Glob(file_paths)
    else:
        files = [file_paths]

    if not files:
        raise ValueError('No data files found in %s' % (file_paths,))
    return files