# -*- coding: utf-8 -*-
# Copyright 2018 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Helper functions and classes for vocabulary processing.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import warnings
from collections import defaultdict

import tensorflow as tf
from tensorflow import gfile
import numpy as np

from texar.tf.utils.utils import dict_lookup

# pylint: disable=too-few-public-methods, invalid-name
# pylint: disable=too-many-instance-attributes, too-many-arguments

__all__ = [
    "SpecialTokens",
    "Vocab"
]


class SpecialTokens(object):
    """Special tokens, including :attr:`PAD`, :attr:`BOS`, :attr:`EOS`,
    :attr:`UNK`. These tokens will by default have token ids 0, 1, 2, 3,
    respectively.
    """
    PAD = "<PAD>"
    BOS = "<BOS>"
    EOS = "<EOS>"
    UNK = "<UNK>"
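
# A minimal sketch (added for illustration, not part of the original module)
# of how these defaults line up with token ids once a `Vocab` (defined below)
# is built; the file name "vocab.txt" is hypothetical:
#
#     vocab = Vocab("vocab.txt")
#     vocab.pad_token_id  # -> 0  (<PAD>)
#     vocab.bos_token_id  # -> 1  (<BOS>)
#     vocab.eos_token_id  # -> 2  (<EOS>)
#     vocab.unk_token_id  # -> 3  (<UNK>)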


def _make_defaultdict(keys, values, default_value):
    """Creates a python defaultdict.

    Args:
        keys (list): Keys of the dictionary.
        values (list): Values corresponding to keys. The two lists
            :attr:`keys` and :attr:`values` must be of the same length.
        default_value: Default value returned when a key is missing.

    Returns:
        defaultdict: A python `defaultdict` instance that maps keys to values.
    """
    dict_ = defaultdict(lambda: default_value)
    for k, v in zip(keys, values):
        dict_[k] = v
    return dict_
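
# A quick illustration (added here, not in the original source) of the
# fallback behavior: missing keys return the default instead of raising
# KeyError.
#
#     d = _make_defaultdict(["a", "b"], [0, 1], default_value=3)
#     d["a"]  # -> 0
#     d["z"]  # -> 3  (unknown key falls back to the default)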


class Vocab(object):
    """Vocabulary class that loads vocabulary from file, and maintains mapping
    tables between token strings and indexes.

    Each line of the vocab file should contain one vocabulary token, e.g.,::

        vocab_token_1
        vocab token 2
        vocab token | 3 .
        ...

    Args:
        filename (str): Path to the vocabulary file where each line contains
            one token.
        bos_token (str): A special token that will be added to the beginning
            of sequences.
        eos_token (str): A special token that will be added to the end of
            sequences.
        unk_token (str): A special token that will replace all unknown tokens
            (tokens not included in the vocabulary).
        pad_token (str): A special token that is used to do padding.
    """

    def __init__(self,
                 filename,
                 pad_token=SpecialTokens.PAD,
                 bos_token=SpecialTokens.BOS,
                 eos_token=SpecialTokens.EOS,
                 unk_token=SpecialTokens.UNK):
        self._filename = filename
        self._pad_token = pad_token
        self._bos_token = bos_token
        self._eos_token = eos_token
        self._unk_token = unk_token

        self._id_to_token_map, self._token_to_id_map, \
            self._id_to_token_map_py, self._token_to_id_map_py = \
            self.load(self._filename)

    def load(self, filename):
        """Loads the vocabulary from the file.

        Args:
            filename (str): Path to the vocabulary file.

        Returns:
            A tuple of TF and python mapping tables between word string and
            index, (:attr:`id_to_token_map`, :attr:`token_to_id_map`,
            :attr:`id_to_token_map_py`, :attr:`token_to_id_map_py`), where
            :attr:`id_to_token_map` and :attr:`token_to_id_map` are
            TF :tf_main:`HashTable <contrib/lookup/HashTable>` instances,
            and :attr:`id_to_token_map_py` and :attr:`token_to_id_map_py`
            are python `defaultdict` instances.
        """
        with gfile.GFile(filename) as vocab_file:
            # Converts to 'unicode' (Python 2) or 'str' (Python 3)
            vocab = list(tf.compat.as_text(line.strip())
                         for line in vocab_file)

        warnings.simplefilter("ignore", UnicodeWarning)

        if self._bos_token in vocab:
            raise ValueError("Special begin-of-seq token already exists in "
                             "the vocabulary: '%s'" % self._bos_token)
        if self._eos_token in vocab:
            raise ValueError("Special end-of-seq token already exists in the "
                             "vocabulary: '%s'" % self._eos_token)
        if self._unk_token in vocab:
            raise ValueError("Special UNK token already exists in the "
                             "vocabulary: '%s'" % self._unk_token)
        if self._pad_token in vocab:
            raise ValueError("Special padding token already exists in the "
                             "vocabulary: '%s'" % self._pad_token)

        warnings.simplefilter("default", UnicodeWarning)

        # Places _pad_token at the beginning to make sure it takes index 0.
        vocab = [self._pad_token, self._bos_token, self._eos_token,
                 self._unk_token] + vocab
        # Must make sure this is consistent with the above line
        unk_token_idx = 3
        vocab_size = len(vocab)
        vocab_idx = np.arange(vocab_size)

        # Creates TF maps
        id_to_token_map = tf.contrib.lookup.HashTable(
            tf.contrib.lookup.KeyValueTensorInitializer(
                vocab_idx, vocab, key_dtype=tf.int64, value_dtype=tf.string),
            self._unk_token)

        token_to_id_map = tf.contrib.lookup.HashTable(
            tf.contrib.lookup.KeyValueTensorInitializer(
                vocab, vocab_idx, key_dtype=tf.string, value_dtype=tf.int64),
            unk_token_idx)

        # Creates python maps to interface with python code
        id_to_token_map_py = _make_defaultdict(
            vocab_idx, vocab, self._unk_token)
        token_to_id_map_py = _make_defaultdict(
            vocab, vocab_idx, unk_token_idx)

        return id_to_token_map, token_to_id_map, \
            id_to_token_map_py, token_to_id_map_py
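
    # Note (added for clarity, not in the original source): the HashTable
    # instances created above are TF 1.x lookup tables, so `lookup()` results
    # are only valid after the tables have been initialized in a session,
    # e.g. (`token_tensor` below is a hypothetical string tensor):
    #
    #     with tf.Session() as sess:
    #         sess.run(tf.tables_initializer())
    #         ids = sess.run(vocab.map_tokens_to_ids(token_tensor))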

    def map_ids_to_tokens(self, ids):
        """Maps ids into text tokens.

        The returned tokens are a Tensor.

        Args:
            ids: An `int` tensor of token ids.

        Returns:
            A tensor of text tokens of the same shape.
        """
        return self.id_to_token_map.lookup(tf.cast(ids, tf.int64))

    def map_tokens_to_ids(self, tokens):
        """Maps text tokens into ids.

        The returned ids are a Tensor.

        Args:
            tokens: A tensor of text tokens.

        Returns:
            A tensor of token ids of the same shape.
        """
        return self.token_to_id_map.lookup(tokens)

    def map_ids_to_tokens_py(self, ids):
        """Maps ids into text tokens.

        The input :attr:`ids` and returned tokens are both python
        arrays or lists.

        Args:
            ids: An `int` numpy array or (possibly nested) list of token ids.

        Returns:
            A numpy array of text tokens of the same shape as :attr:`ids`.
        """
        return dict_lookup(self.id_to_token_map_py, ids, self.unk_token)

    def map_tokens_to_ids_py(self, tokens):
        """Maps text tokens into ids.

        The input :attr:`tokens` and returned ids are both python
        arrays or lists.

        Args:
            tokens: A numpy array or (possibly nested) list of text tokens.

        Returns:
            A numpy array of token ids of the same shape as :attr:`tokens`.
        """
        return dict_lookup(self.token_to_id_map_py, tokens, self.unk_token_id)
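
    # A small illustration (added here, not in the original source) of the
    # python-side lookup, assuming a hypothetical vocabulary that contains
    # "hello" but not "world":
    #
    #     vocab.map_tokens_to_ids_py([["hello", "world"]])
    #     # -> [[<id of "hello">, 3]]  ("world" falls back to the UNK id)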

    @property
    def id_to_token_map(self):
        """The :tf_main:`HashTable <contrib/lookup/HashTable>` instance that
        maps from token index to the string form.
        """
        return self._id_to_token_map

    @property
    def token_to_id_map(self):
        """The :tf_main:`HashTable <contrib/lookup/HashTable>` instance that
        maps from token string to the index.
        """
        return self._token_to_id_map

    @property
    def id_to_token_map_py(self):
        """The python `defaultdict` instance that maps from token index to
        the string form.
        """
        return self._id_to_token_map_py

    @property
    def token_to_id_map_py(self):
        """The python `defaultdict` instance that maps from token string to
        the index.
        """
        return self._token_to_id_map_py

    @property
    def size(self):
        """The vocabulary size.
        """
        return len(self.token_to_id_map_py)

    @property
    def bos_token(self):
        """A string of the special token indicating the beginning of
        sequence.
        """
        return self._bos_token

    @property
    def bos_token_id(self):
        """The `int` index of the special token indicating the beginning of
        sequence.
        """
        return self.token_to_id_map_py[self._bos_token]

    @property
    def eos_token(self):
        """A string of the special token indicating the end of sequence.
        """
        return self._eos_token

    @property
    def eos_token_id(self):
        """The `int` index of the special token indicating the end of
        sequence.
        """
        return self.token_to_id_map_py[self._eos_token]

    @property
    def unk_token(self):
        """A string of the special token indicating unknown token.
        """
        return self._unk_token

    @property
    def unk_token_id(self):
        """The `int` index of the special token indicating unknown token.
        """
        return self.token_to_id_map_py[self._unk_token]

    @property
    def pad_token(self):
        """A string of the special token indicating padding token. The
        default padding token is `"<PAD>"`.
        """
        return self._pad_token

    @property
    def pad_token_id(self):
        """The `int` index of the special token indicating padding token.
        """
        return self.token_to_id_map_py[self._pad_token]

    @property
    def special_tokens(self):
        """The list of special tokens
        [:attr:`pad_token`, :attr:`bos_token`, :attr:`eos_token`,
        :attr:`unk_token`].
        """
        return [self._pad_token, self._bos_token, self._eos_token,
                self._unk_token]
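

if __name__ == "__main__":
    # A minimal, self-contained usage sketch (not part of the original
    # module), assuming a TF 1.x graph-mode environment. It writes a tiny
    # vocab file, builds a `Vocab`, and exercises both the python-side and
    # the TF-side lookup paths. All names here ("hello", "world", the temp
    # file) are made up for illustration.
    import tempfile

    with tempfile.NamedTemporaryFile(
            mode="w", suffix=".txt", delete=False) as f:
        f.write("hello\nworld\n")
        vocab_path = f.name

    vocab = Vocab(vocab_path)
    print(vocab.size)  # 6: <PAD>, <BOS>, <EOS>, <UNK>, hello, world

    # Python-side lookup; out-of-vocabulary tokens fall back to the UNK id.
    print(vocab.map_tokens_to_ids_py([["hello", "unseen"]]))  # [[4 3]]

    # TF-side lookup; the lookup tables must be initialized first.
    tokens = tf.constant([["hello", "unseen"]])
    ids = vocab.map_tokens_to_ids(tokens)
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        print(sess.run(ids))  # [[4 3]]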