Source code for nltk.align.bleu

# -*- coding: utf-8 -*-
# Natural Language Toolkit: BLEU
#
# Copyright (C) 2001-2013 NLTK Project
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

from __future__ import division

import math

from nltk import word_tokenize
from nltk.compat import Counter
from nltk.util import ngrams


[docs]class BLEU(object):
    """
    This class implements the BLEU method, which is used to evaluate
    the quality of machine translation. [1]

    Consider an example:

    >>> weights = [0.25, 0.25, 0.25, 0.25]
    >>> candidate1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...               'ensures', 'that', 'the', 'military', 'always',
    ...               'obeys', 'the', 'commands', 'of', 'the', 'party']

    >>> candidate2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
    ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
    ...               'that', 'party', 'direct']

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...               'heed', 'Party', 'commands']

    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']

    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']

    The BLEU method mainly consists of two parts:

    Part 1 - modified n-gram precision

    The normal precision method may lead to some wrong translations with
    high-precision, e.g., the translation, in which a word of reference
    repeats several times, has very high precision. So in the modified
    n-gram precision, a reference word will be considered exhausted after
    a matching candidate word is identified.

    Unigrams:

    >>> BLEU.modified_precision(
    ...    candidate1,
    ...    [reference1, reference2, reference3],
    ...    n=1,
    ... )
    0.94...

    >>> BLEU.modified_precision(
    ...    candidate2,
    ...    [reference1, reference2, reference3],
    ...    n=1,
    ... )
    0.57...

    Bigrmas:

    >>> BLEU.modified_precision(
    ...    candidate1,
    ...    [reference1, reference2, reference3],
    ...    n=2,
    ... )
    0.58...

    >>> BLEU.modified_precision(
    ...    candidate2,
    ...    [reference1, reference2, reference3],
    ...    n=2,
    ... )
    0.07...


    Part 2 - brevity penalty

    As the modified n-gram precision still has the problem from the short
    length sentence, brevity penalty is used to modify the overall BLEU
    score according to length.

    >>> BLEU.compute(candidate1, [reference1, reference2, reference3], weights)
    0.504...

    >>> BLEU.compute(candidate2, [reference1, reference2, reference3], weights)
    0.457...

    2. Test with two corpus that one is a reference and another is
    an output from translation system:

    >>> weights = [0.25, 0.25, 0.25, 0.25]
    >>> ref_file = open('newstest2012-ref.en')  # doctest: +SKIP
    >>> candidate_file = open('newstest2012.fr-en.cmu-avenue')  # doctest: +SKIP

    >>> total = 0.0
    >>> count = 0

    >>> for candi_raw in candidate_file:  # doctest: +SKIP
    ...		ref_raw = ref_file.readline()
    ...		ref_tokens = word_tokenize(ref_raw)
    ...		candi_tokens = word_tokenize(candi_raw)
    ...		total = BLEU.compute(candi_tokens, [ref_tokens], weights)
    ...		count += 1

    >>> total / count  # doctest: +SKIP
    2.787504437460048e-05

    [1] Papineni, Kishore, et al. "BLEU: a method for automatic evaluation of
    machine translation." Proceedings of the 40th annual meeting on
    association for computational linguistics. Association for Computational
    Linguistics, 2002.

    """

    @staticmethod
[docs]    def compute(candidate, references, weights):
        candidate = [c.lower() for c in candidate]
        references = [[r.lower() for r in reference] for reference in references]

        p_ns = (BLEU.modified_precision(candidate, references, i) for i, _ in enumerate(weights, start=1))
        s = math.fsum(w * math.log(p_n) for w, p_n in zip(weights, p_ns) if p_n)

        bp = BLEU.brevity_penalty(candidate, references)
        return bp * math.exp(s)

    @staticmethod
[docs]    def modified_precision(candidate, references, n):
        """ Calculate modified ngram precision.

        >>> BLEU.modified_precision(
        ...    'the the the the the the the'.split(),
        ...    ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()],
        ...    n=1,
        ... )
        0.28...

        >>> BLEU.modified_precision(
        ...    'the the the the the the the'.split(),
        ...    ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()],
        ...    n=2,
        ... )
        0.0

        >>> BLEU.modified_precision(
        ...    'of the'.split(),
        ...    [
        ...        'It is a guide to action that ensures that the military will forever heed Party commands.'.split(),
        ...        'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(),
        ...        'It is the practical guide for the army always to heed the directions of the party'.split(),
        ...    ],
        ...    n=1,
        ... )
        1.0

        >>> BLEU.modified_precision(
        ...    'of the'.split(),
        ...    [
        ...        'It is a guide to action that ensures that the military will forever heed Party commands.'.split(),
        ...        'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(),
        ...        'It is the practical guide for the army always to heed the directions of the party'.split(),
        ...    ],
        ...    n=2,
        ... )
        1.0

        """
        counts = Counter(ngrams(candidate, n))

        if not counts:
            return 0

        max_counts = {}
        for reference in references:
            reference_counts = Counter(ngrams(reference, n))
            for ngram in counts:
                max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])

        clipped_counts = dict((ngram, min(count, max_counts[ngram])) for ngram, count in counts.items())

        return sum(clipped_counts.values()) / sum(counts.values())

    @staticmethod
[docs]    def brevity_penalty(candidate, references):
        c = len(candidate)
        r = min(abs(len(r) - c) for r in references)

        if c > r:
            return 1
        else:
            return math.exp(1 - r / c)

# run doctests
if __name__ == "__main__":
    import doctest
    doctest.testmod()
Source code for nltk.align.bleu

Table Of Contents

Search