Source code for texar.tf.evals.bleu_moses

# -*- coding: utf-8 -*-
# Copyright 2018 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The BLEU metric.
"""

import os
from io import open  # pylint: disable=redefined-builtin
import shutil
import re
import subprocess
import tempfile
import numpy as np

import tensorflow as tf

from texar.tf.utils.dtypes import compat_as_text

# pylint: disable=too-many-locals, no-member, redefined-variable-type

__all__ = [
    "sentence_bleu_moses",
    "corpus_bleu_moses"
]


def _maybe_list_to_str(list_or_str):
    if isinstance(list_or_str, (tuple, list, np.ndarray)):
        return ' '.join(list_or_str)
    return list_or_str


def _parse_multi_bleu_ret(bleu_str, return_all=False):
    bleu_score = re.search(r"BLEU = (.+?),", bleu_str).group(1)
    bleu_score = np.float32(bleu_score)

    if return_all:
        bleus = re.search(r", (.+?)/(.+?)/(.+?)/(.+?) ", bleu_str)
        bleus = [bleus.group(group_idx) for group_idx in range(1, 5)]
        bleus = [np.float32(b) for b in bleus]
        bleu_score = [bleu_score] + bleus

    return bleu_score


[docs]def sentence_bleu_moses(references, hypothesis, lowercase=False,
                        return_all=False):
    """Calculates BLEU score of a hypothesis sentence using the
    **MOSES multi-bleu.perl** script.

    Args:
        references: A list of reference for the hypothesis.
            Each reference can be either a string, or a list of string tokens.
            List can also be numpy array.
        hypotheses: A hypothesis sentence.
            The hypothesis can be either a string, or a list of string tokens.
            List can also be numpy array.
        lowercase (bool): If `True`, pass the "-lc" flag to the multi-bleu
            script.
        return_all (bool): If `True`, returns BLEU and all n-gram precisions.

    Returns:
        If :attr:`return_all` is `False` (default), returns a float32
        BLEU score.

        If :attr:`return_all` is `True`, returns a list of 5 float32 scores:
        `[BLEU, 1-gram precision, ..., 4-gram precision]`.
    """
    return corpus_bleu_moses(
        [references], [hypothesis], lowercase=lowercase, return_all=return_all)


[docs]def corpus_bleu_moses(list_of_references, hypotheses, lowercase=False,
                      return_all=False):
    """Calculates corpus-level BLEU score using the
    **MOSES multi-bleu.perl** script.

    Args:
        list_of_references: A list of lists of references for each hypothesis.
            Each reference can be either a string, or a list of string tokens.
            List can also be numpy array.
        hypotheses: A list of hypothesis sentences.
            Each hyperthsis can be either a string, or a list of string tokens.
            List can also be numpy array.
        lowercase (bool): If `True`, pass the "-lc" flag to the multi-bleu
            script.
        return_all (bool): If `True`, returns BLEU and all n-gram precisions.

    Returns:
        If :attr:`return_all` is `False` (default), returns a float32
        BLEU score.

        If :attr:`return_all` is `True`, returns a list of 5 float32 scores:
        `[BLEU, 1-gram precision, ..., 4-gram precision]`.
    """
    list_of_references = compat_as_text(list_of_references)
    hypotheses = compat_as_text(hypotheses)

    if np.size(hypotheses) == 0:
        return np.float32(0.)   # pylint: disable=no-member

    # Get multi-bleu.perl
    cur_dir = os.path.dirname(os.path.realpath(__file__))
    multi_bleu_path = os.path.abspath(
        os.path.join(cur_dir, "..", "..", "..", "bin", "utils",
                     "multi-bleu.perl"))

    # Create a temporary folder containing hyperthesis and reference files
    result_path = tempfile.mkdtemp()
    # Create hyperthesis file
    hfile_path = os.path.join(result_path, 'hyp')
    hyps = [_maybe_list_to_str(h) for h in hypotheses]
    with open(hfile_path, 'w', encoding='utf-8') as hfile:
        text = "\n".join(hyps)
        hfile.write(text)
        hfile.write("\n")
    # Create reference files
    max_nrefs = max([len(refs) for refs in list_of_references])
    rfile_path = os.path.join(result_path, 'ref')
    for rid in range(max_nrefs):
        with open(rfile_path + '%d' % rid, 'w', encoding='utf-8') as rfile:
            for refs in list_of_references:
                if rid < len(refs):
                    ref = _maybe_list_to_str(refs[rid])
                    rfile.write(ref + "\n")
                else:
                    rfile.write("\n")

    # Calculate BLEU
    multi_bleu_cmd = [multi_bleu_path]
    if lowercase:
        multi_bleu_cmd += ["-lc"]
    multi_bleu_cmd += [rfile_path]
    with open(hfile_path, "r") as hyp_input:
        try:
            multi_bleu_ret = subprocess.check_output(
                multi_bleu_cmd, stdin=hyp_input, stderr=subprocess.STDOUT)
            multi_bleu_ret = multi_bleu_ret.decode("utf-8")
            bleu_score = _parse_multi_bleu_ret(multi_bleu_ret, return_all)
        except subprocess.CalledProcessError as error:
            if error.output is not None:
                tf.logging.warning(
                    "multi-bleu.perl returned non-zero exit code")
                tf.logging.warning(error.output)
            if return_all:
                bleu_score = [np.float32(0.0)] * 5
            else:
                bleu_score = np.float32(0.0)

    shutil.rmtree(result_path)

    return np.float32(bleu_score)