Source code for texar.tf.data.embedding

# Copyright 2018 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Helper functions and classes for embedding processing.
"""

import tensorflow as tf
from tensorflow import gfile
import numpy as np

from texar.tf.utils import utils
from texar.tf.hyperparams import HParams

__all__ = [
    "load_word2vec",
    "load_glove",
    "Embedding"
]


[docs]def load_word2vec(filename, vocab, word_vecs):
    """Loads embeddings in the word2vec binary format which has a header line
    containing the number of vectors and their dimensionality (two integers),
    followed with number-of-vectors lines each of which is formatted as
    '<word-string> <embedding-vector>'.

    Args:
        filename (str): Path to the embedding file.
        vocab (dict): A dictionary that maps token strings to integer index.
            Tokens not in :attr:`vocab` are not read.
        word_vecs: A 2D numpy array of shape `[vocab_size, embed_dim]`
            which is updated as reading from the file.

    Returns:
        The updated :attr:`word_vecs`.
    """
    with gfile.GFile(filename, "rb") as fin:
        header = fin.readline()
        vocab_size, vector_size = [int(s) for s in header.split()]
        if vector_size != word_vecs.shape[1]:
            raise ValueError("Inconsistent word vector sizes: %d vs %d" %
                             (vector_size, word_vecs.shape[1]))
        binary_len = np.dtype('float32').itemsize * vector_size
        for _ in np.arange(vocab_size):
            chars = []
            while True:
                char = fin.read(1)
                if char == b' ':
                    break
                if char != b'\n':
                    chars.append(char)
            word = b''.join(chars)
            word = tf.compat.as_text(word)
            if word in vocab:
                word_vecs[vocab[word]] = np.fromstring(
                    fin.read(binary_len), dtype='float32')
            else:
                fin.read(binary_len)
    return word_vecs


[docs]def load_glove(filename, vocab, word_vecs):
    """Loads embeddings in the glove text format in which each line is
    '<word-string> <embedding-vector>'. Dimensions of the embedding vector
    are separated with whitespace characters.

    Args:
        filename (str): Path to the embedding file.
        vocab (dict): A dictionary that maps token strings to integer index.
            Tokens not in :attr:`vocab` are not read.
        word_vecs: A 2D numpy array of shape `[vocab_size, embed_dim]`
            which is updated as reading from the file.

    Returns:
        The updated :attr:`word_vecs`.
    """
    with gfile.GFile(filename) as fin:
        for line in fin:
            vec = line.strip().split()
            if len(vec) == 0:
                continue
            word, vec = vec[0], vec[1:]
            word = tf.compat.as_text(word)
            if word not in vocab:
                continue
            if len(vec) != word_vecs.shape[1]:
                raise ValueError("Inconsistent word vector sizes: %d vs %d" %
                                 (len(vec), word_vecs.shape[1]))
            word_vecs[vocab[word]] = np.array([float(v) for v in vec])
    return word_vecs


[docs]class Embedding(object):
    """Embedding class that loads token embedding vectors from file. Token
    embeddings not in the embedding file are initialized as specified in
    :attr:`hparams`.

    Args:
        vocab (dict): A dictionary that maps token strings to integer index.
        read_fn: Callable that takes `(filename, vocab, word_vecs)` and
            returns the updated `word_vecs`. E.g.,
            :func:`~texar.tf.data.embedding.load_word2vec` and
            :func:`~texar.tf.data.embedding.load_glove`.
    """
    def __init__(self, vocab, hparams=None):
        self._hparams = HParams(hparams, self.default_hparams())

        # Initialize embeddings
        init_fn_kwargs = self._hparams.init_fn.kwargs.todict()
        if "shape" in init_fn_kwargs or "size" in init_fn_kwargs:
            raise ValueError("Argument 'shape' or 'size' must not be "
                             "specified. They are inferred automatically.")
        init_fn = utils.get_function(
            self._hparams.init_fn.type,
            ["numpy.random", "numpy", "texar.tf.custom"])

        try:
            self._word_vecs = init_fn(size=[len(vocab), self._hparams.dim],
                                      **init_fn_kwargs)
        except TypeError:
            self._word_vecs = init_fn(shape=[len(vocab), self._hparams.dim],
                                      **init_fn_kwargs)

        # Optionally read embeddings from file
        if self._hparams.file is not None and self._hparams.file != "":
            read_fn = utils.get_function(
                self._hparams.read_fn,
                ["texar.tf.data.embedding", "texar.tf.data", "texar.tf.custom"])

            self._word_vecs = \
                read_fn(self._hparams.file, vocab, self._word_vecs)

[docs]    @staticmethod
    def default_hparams():
        """Returns a dictionary of hyperparameters with default values:

        .. role:: python(code)
           :language: python

        .. code-block:: python

            {
                "file": "",
                "dim": 50,
                "read_fn": "load_word2vec",
                "init_fn": {
                    "type": "numpy.random.uniform",
                    "kwargs": {
                        "low": -0.1,
                        "high": 0.1,
                    }
                },
            }

        Here:

        "file": str
            Path to the embedding file. If not provided, all embeddings are
            initialized with the initialization function.

        "dim": int
            Dimension size of each embedding vector

        "read_fn": str or callable
            Function to read the embedding file. This can be the function,
            or its string name or full module path. E.g.,

            .. code-block:: python

                "read_fn": texar.tf.data.load_word2vec
                "read_fn": "load_word2vec"
                "read_fn": "texar.tf.data.load_word2vec"
                "read_fn": "my_module.my_read_fn"

            If function string name is used, the function must be in
            one of the modules: :mod:`texar.tf.data` or :mod:`texar.tf.custom`.

            The function must have the same signature as with
            :func:`load_word2vec`.

        "init_fn": dict
            Hyperparameters of the initialization function used to initialize
            embedding of tokens missing in the embedding
            file.

            The function must accept argument named `size` or `shape` to
            specify the output shape, and return a numpy array of the shape.

            The `dict` has the following fields:

                "type": str or callable
                    The initialization function. Can be either the function,
                    or its string name or full module path.

                "kwargs": dict
                    Keyword arguments for calling the function. The function
                    is called with :python:`init_fn(size=[.., ..], **kwargs)`.
        """
        return {
            "file": "",
            "dim": 50,
            "read_fn": "load_word2vec",
            "init_fn": {
                "type": "numpy.random.uniform",
                "kwargs": {
                    "low": -0.1,
                    "high": 0.1,
                },
            },
            "@no_typecheck": ["read_fn", "init_fn"]
        }

    @property
    def word_vecs(self):
        """2D numpy array of shape `[vocab_size, embedding_dim]`.
        """
        return self._word_vecs

    @property
    def vector_size(self):
        """The embedding dimention size.
        """
        return self._hparams.dim