# Source code for nltk.align.bleu

# -*- coding: utf-8 -*-
# Natural Language Toolkit: BLEU
#
# Copyright (C) 2001-2013 NLTK Project
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

from __future__ import division

import math

from nltk import word_tokenize
from nltk.compat import Counter
from nltk.util import ngrams


class BLEU(object):
    """
    This class implements the BLEU method, which is used to evaluate
    the quality of machine translation. [1]

    Consider an example:

    >>> weights = [0.25, 0.25, 0.25, 0.25]
    >>> candidate1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...               'ensures', 'that', 'the', 'military', 'always',
    ...               'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> candidate2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
    ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
    ...               'that', 'party', 'direct']
    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...               'heed', 'Party', 'commands']
    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']
    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']

    The BLEU method mainly consists of two parts:

    Part 1 - modified n-gram precision

    The normal precision method may lead to some wrong translations with
    high-precision, e.g., the translation, in which a word of reference
    repeats several times, has very high precision. So in the modified
    n-gram precision, a reference word will be considered exhausted after
    a matching candidate word is identified.

    Unigrams:

    >>> BLEU.modified_precision(
    ...    candidate1,
    ...    [reference1, reference2, reference3],
    ...    n=1,
    ... )
    0.94...

    >>> BLEU.modified_precision(
    ...    candidate2,
    ...    [reference1, reference2, reference3],
    ...    n=1,
    ... )
    0.57...

    Bigrams:

    >>> BLEU.modified_precision(
    ...    candidate1,
    ...    [reference1, reference2, reference3],
    ...    n=2,
    ... )
    0.58...

    >>> BLEU.modified_precision(
    ...    candidate2,
    ...    [reference1, reference2, reference3],
    ...    n=2,
    ... )
    0.07...

    Part 2 - brevity penalty

    As the modified n-gram precision still has the problem from the short
    length sentence, brevity penalty is used to modify the overall BLEU
    score according to length.

    >>> BLEU.compute(candidate1, [reference1, reference2, reference3], weights)
    0.504...

    >>> BLEU.compute(candidate2, [reference1, reference2, reference3], weights)
    0.396...

    2. Test with two corpus that one is a reference and another is
    an output from translation system:

    >>> weights = [0.25, 0.25, 0.25, 0.25]
    >>> ref_file = open('newstest2012-ref.en') # doctest: +SKIP
    >>> candidate_file = open('newstest2012.fr-en.cmu-avenue') # doctest: +SKIP

    >>> total = 0.0
    >>> count = 0

    >>> for candi_raw in candidate_file: # doctest: +SKIP
    ...     ref_raw = ref_file.readline()
    ...     ref_tokens = word_tokenize(ref_raw)
    ...     candi_tokens = word_tokenize(candi_raw)
    ...     total += BLEU.compute(candi_tokens, [ref_tokens], weights)
    ...     count += 1

    >>> total / count # doctest: +SKIP
    2.787504437460048e-05

    [1] Papineni, Kishore, et al. "BLEU: a method for automatic evaluation
    of machine translation." Proceedings of the 40th annual meeting on
    association for computational linguistics. Association for
    Computational Linguistics, 2002.
    """

    @staticmethod
    def compute(candidate, references, weights):
        """Compute the sentence-level BLEU score of ``candidate``.

        :param candidate: list of tokens of the candidate translation
        :param references: list of reference translations, each a list
            of tokens
        :param weights: weight of each modified n-gram precision; the
            i-th weight (0-based index i) applies to (i+1)-gram precision
        :return: the BLEU score (brevity penalty times the weighted
            geometric mean of the modified n-gram precisions)
        """
        # Comparison is case-insensitive.
        candidate = [c.lower() for c in candidate]
        references = [[r.lower() for r in reference] for reference in references]

        p_ns = (
            BLEU.modified_precision(candidate, references, i)
            for i, _ in enumerate(weights, start=1)
        )
        # NOTE(review): n-gram orders with zero precision are skipped
        # (the `if p_n` filter) instead of sending the geometric mean to
        # zero; standard BLEU would return 0 whenever any p_n is 0. This
        # looks like a deliberate smoothing choice, so it is kept.
        s = math.fsum(w * math.log(p_n) for w, p_n in zip(weights, p_ns) if p_n)

        bp = BLEU.brevity_penalty(candidate, references)
        return bp * math.exp(s)

    @staticmethod
    def modified_precision(candidate, references, n):
        """
        Calculate modified ngram precision.

        Each candidate n-gram count is clipped to the maximum number of
        times that n-gram occurs in any single reference, so repeating a
        reference word cannot inflate precision.

        >>> BLEU.modified_precision(
        ...    'the the the the the the the'.split(),
        ...    ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()],
        ...    n=1,
        ... )
        0.28...

        >>> BLEU.modified_precision(
        ...    'the the the the the the the'.split(),
        ...    ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()],
        ...    n=2,
        ... )
        0.0

        >>> BLEU.modified_precision(
        ...    'of the'.split(),
        ...    [
        ...        'It is a guide to action that ensures that the military will forever heed Party commands.'.split(),
        ...        'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(),
        ...        'It is the practical guide for the army always to heed the directions of the party'.split(),
        ...    ],
        ...    n=1,
        ... )
        1.0

        >>> BLEU.modified_precision(
        ...    'of the'.split(),
        ...    [
        ...        'It is a guide to action that ensures that the military will forever heed Party commands.'.split(),
        ...        'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(),
        ...        'It is the practical guide for the army always to heed the directions of the party'.split(),
        ...    ],
        ...    n=2,
        ... )
        1.0

        """
        counts = Counter(ngrams(candidate, n))

        # A candidate shorter than n tokens has no n-grams at all.
        if not counts:
            return 0

        # For each candidate n-gram, the largest count seen in any one
        # reference — this is the clipping ceiling.
        max_counts = {}
        for reference in references:
            reference_counts = Counter(ngrams(reference, n))
            for ngram in counts:
                max_counts[ngram] = max(max_counts.get(ngram, 0),
                                        reference_counts[ngram])

        clipped_counts = dict((ngram, min(count, max_counts[ngram]))
                              for ngram, count in counts.items())

        return sum(clipped_counts.values()) / sum(counts.values())

    @staticmethod
    def brevity_penalty(candidate, references):
        """Return the brevity penalty for ``candidate``.

        Following Papineni et al. (2002), ``r`` is the "best match
        length": the length of the reference closest in length to the
        candidate (ties broken toward the shorter reference). The
        penalty is 1 when the candidate is at least as long as ``r``,
        and exp(1 - r/c) otherwise.

        :param candidate: list of tokens of the candidate translation
        :param references: list of reference translations, each a list
            of tokens
        :return: a float in (0, 1]; 0 for an empty candidate
        """
        c = len(candidate)
        # Guard: an empty candidate would otherwise divide by zero.
        if c == 0:
            return 0
        # Bug fix: the previous code took min(abs(len(ref) - c)), i.e.
        # the smallest length *difference*, and compared c against that
        # difference. We need the length of the closest reference.
        r = min((abs(len(ref) - c), len(ref)) for ref in references)[1]

        if c > r:
            return 1
        return math.exp(1 - r / c)
# Run the module's doctests when this file is executed as a script.
if __name__ == "__main__":
    import doctest

    doctest.testmod()