Source code for texar.tf.evals.bleu

# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modifications copyright (C) 2018 Texar
# ==============================================================================
"""
Python implementation of BLEU and smoothed BLEU adapted from:
    `https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py`

This module provides a Python implementation of BLEU and smoothed BLEU.
Smooth BLEU is computed following the method outlined in the paper:

    (Lin et al. 2004) ORANGE: a method for evaluating automatic evaluation
    metrics for machine translation.
    Chin-Yew Lin, Franz Josef Och. COLING 2004.
"""

import collections
import math

from texar.tf.utils.dtypes import compat_as_text, is_str

# pylint: disable=invalid-name, too-many-branches, too-many-locals
# pylint: disable=too-many-arguments

__all__ = [
    "sentence_bleu",
    "corpus_bleu"
]


def _get_ngrams(segment, max_order):
    """Counts every n-gram of order 1 through `max_order` in a segment.

    Args:
        segment: Sequence of tokens to extract n-grams from. It must
            support `len()` and slicing.
        max_order: Largest n-gram length (in tokens) to extract.

    Returns:
        A `collections.Counter` mapping each n-gram (a tuple of tokens) to
        the number of times it occurs in `segment`.
    """
    num_tokens = len(segment)
    counts = collections.Counter()
    for n in range(1, max_order + 1):
        # An n-gram of length `n` can start at positions 0..num_tokens - n;
        # the range is empty when the segment is shorter than `n`.
        counts.update(
            tuple(segment[start:start + n])
            for start in range(num_tokens - n + 1))
    return counts


def _maybe_str_to_list(list_or_str):
    """Splits a string on whitespace; returns any other input unchanged.

    Args:
        list_or_str: Either a string of whitespace-separated tokens (as
            decided by `is_str`) or an already tokenized sequence.

    Returns:
        A list of tokens if the input was a string, otherwise the input
        object itself.
    """
    if not is_str(list_or_str):
        return list_or_str
    return list_or_str.split()


def _lowercase(str_list):
    """Returns a new list with `.lower()` applied to every item.

    Args:
        str_list: Iterable of string tokens.

    Returns:
        A list holding the lowercased tokens, in the original order.
    """
    lowered = []
    for token in str_list:
        lowered.append(token.lower())
    return lowered


def sentence_bleu(references, hypothesis, max_order=4, lowercase=False,
                  smooth=False, return_all=False):
    """Calculates BLEU score of a hypothesis sentence.

    The sentence is scored by :func:`corpus_bleu` as a corpus that contains
    exactly one hypothesis.

    Args:
        references: A list of references for the hypothesis.
            Each reference can be either a list of string tokens, or a
            string containing tokenized tokens separated with whitespaces.
            List can also be numpy array.
        hypothesis: A hypothesis sentence. It can be either a list of
            string tokens, or a string containing tokenized tokens
            separated with whitespaces. List can also be numpy array.
        max_order (int): Maximum n-gram order to use when computing BLEU
            score.
        lowercase (bool): If `True`, lowercase reference and hypothesis
            tokens.
        smooth (bool): Whether or not to apply (Lin et al. 2004) smoothing.
        return_all (bool): If `True`, returns BLEU and all n-gram
            precisions.

    Returns:
        If :attr:`return_all` is `False` (default), returns a float BLEU
        score.

        If :attr:`return_all` is `True`, returns a list of float scores:
        `[BLEU] + n-gram precisions`, which is of length
        :attr:`max_order` + 1.

        All scores are scaled by 100.
    """
    # Forward the scoring options unchanged; only the inputs are wrapped.
    options = dict(
        max_order=max_order,
        lowercase=lowercase,
        smooth=smooth,
        return_all=return_all)
    return corpus_bleu([references], [hypothesis], **options)
def corpus_bleu(list_of_references, hypotheses, max_order=4,
                lowercase=False, smooth=False, return_all=True):
    """Computes corpus-level BLEU score.

    Args:
        list_of_references: A list of lists of references for each
            hypothesis. Each reference can be either a list of string
            tokens, or a string containing tokenized tokens separated with
            whitespaces. List can also be numpy array.
        hypotheses: A list of hypothesis sentences.
            Each hypothesis can be either a list of string tokens, or a
            string containing tokenized tokens separated with whitespaces.
            List can also be numpy array.
        max_order (int): Maximum n-gram order to use when computing BLEU
            score.
        lowercase (bool): If `True`, lowercase reference and hypothesis
            tokens.
        smooth (bool): Whether or not to apply (Lin et al. 2004) smoothing,
            i.e. add one to both the matched and the possible n-gram counts
            of every order.
        return_all (bool): If `True`, returns BLEU and all n-gram
            precisions.

    Returns:
        If :attr:`return_all` is `True` (default), returns a list of float
        scores: `[BLEU] + n-gram precisions`, which is of length
        :attr:`max_order` + 1.

        If :attr:`return_all` is `False`, returns a float BLEU score.

        All scores are scaled by 100.

    Raises:
        ValueError: If the reference list of some hypothesis is empty
            (the shortest reference length cannot be determined).
    """
    # Normalize inputs with the imported helper (presumably converts
    # bytes to unicode text -- confirm against `compat_as_text`).
    list_of_references = compat_as_text(list_of_references)
    hypotheses = compat_as_text(hypotheses)

    matches_by_order = [0] * max_order
    possible_matches_by_order = [0] * max_order
    reference_length = 0
    hypothesis_length = 0

    for references, hypothesis in zip(list_of_references, hypotheses):
        # Tokenize BEFORE measuring lengths: for string inputs `len()` of
        # the raw string would count characters instead of tokens and
        # skew the brevity penalty.
        references = [_maybe_str_to_list(ref) for ref in references]
        hypothesis = _maybe_str_to_list(hypothesis)
        if lowercase:
            references = [_lowercase(ref) for ref in references]
            hypothesis = _lowercase(hypothesis)

        # The brevity penalty uses the shortest reference of each sentence.
        reference_length += min(len(ref) for ref in references)
        hypothesis_length += len(hypothesis)

        # Counter union keeps, per n-gram, the maximum count seen in any
        # single reference (the clipping bound).
        merged_ref_ngram_counts = collections.Counter()
        for reference in references:
            merged_ref_ngram_counts |= _get_ngrams(reference, max_order)

        # Counter intersection clips hypothesis counts by that bound.
        hypothesis_ngram_counts = _get_ngrams(hypothesis, max_order)
        overlap = hypothesis_ngram_counts & merged_ref_ngram_counts
        for ngram, count in overlap.items():
            matches_by_order[len(ngram) - 1] += count

        for order in range(1, max_order + 1):
            possible_matches = len(hypothesis) - order + 1
            if possible_matches > 0:
                possible_matches_by_order[order - 1] += possible_matches

    precisions = [0] * max_order
    for i in range(max_order):
        if smooth:
            precisions[i] = ((matches_by_order[i] + 1.) /
                             (possible_matches_by_order[i] + 1.))
        elif possible_matches_by_order[i] > 0:
            precisions[i] = (float(matches_by_order[i]) /
                             possible_matches_by_order[i])
        else:
            precisions[i] = 0.0

    # The geometric mean is only defined when every precision is positive.
    if min(precisions) > 0:
        p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
        geo_mean = math.exp(p_log_sum)
    else:
        geo_mean = 0.0

    # Brevity penalty, with the degenerate lengths handled explicitly
    # instead of relying on a ZeroDivisionError.
    if hypothesis_length == 0:
        # Limit of exp(1 - 1/ratio) as ratio -> 0.
        bp = 0.
    elif reference_length == 0:
        # Hypothesis is longer than the (empty) references: no penalty.
        bp = 1.
    else:
        ratio = float(hypothesis_length) / reference_length
        bp = 1. if ratio > 1.0 else math.exp(1 - 1. / ratio)

    bleu = geo_mean * bp

    if return_all:
        return [bleu * 100] + [p * 100 for p in precisions]
    return bleu * 100