# Source code for nltk.align

# Natural Language Toolkit: Aligned Sentences
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Will Zhang <wilzzha@gmail.com>
#         Guan Gui <ggui@student.unimelb.edu.au>
#         Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import print_function, unicode_literals
import logging
from collections import defaultdict

from nltk import compat
from nltk.metrics import precision, recall

@compat.python_2_unicode_compatible
class AlignedSent(object):
    """
    Return an aligned sentence object, which encapsulates two sentences
    along with an ``Alignment`` between them.

        >>> from nltk.align import AlignedSent
        >>> algnsent = AlignedSent(['klein', 'ist', 'das', 'Haus'],
        ...     ['the', 'house', 'is', 'small'], '0-2 1-3 2-1 3-0')
        >>> algnsent.words
        ['klein', 'ist', 'das', 'Haus']
        >>> algnsent.mots
        ['the', 'house', 'is', 'small']
        >>> algnsent.alignment
        Alignment([(0, 2), (1, 3), (2, 1), (3, 0)])
        >>> algnsent.precision('0-2 1-3 2-1 3-3')
        0.75

    :param words: source language words
    :type words: list(str)
    :param mots: target language words
    :type mots: list(str)
    :param alignment: the word-level alignments between the source
        and target language
    :type alignment: Alignment
    """

    def __init__(self, words=None, mots=None, alignment='', encoding='utf8'):
        # ``None`` defaults replace the original mutable default arguments
        # ([] lists), which were shared across every call of __init__.
        self._words = [] if words is None else words
        self._mots = [] if mots is None else mots
        # Assigning through the property validates the alignment against
        # the sentence boundaries (see _set_alignment below).
        self.alignment = alignment
        # NOTE(review): ``encoding`` is accepted for backward compatibility
        # but is never used anywhere in this class.

    @property
    def words(self):
        """Source-language tokens of this sentence pair."""
        return self._words

    @property
    def mots(self):
        """Target-language tokens of this sentence pair."""
        return self._mots

    def _get_alignment(self):
        return self._alignment

    def _set_alignment(self, alignment):
        # Coerce strings / iterables of pairs into an Alignment, then make
        # sure every index falls inside the two sentences.
        if not isinstance(alignment, Alignment):
            alignment = Alignment(alignment)
        self._check_align(alignment)
        self._alignment = alignment

    alignment = property(_get_alignment, _set_alignment)

    def _check_align(self, a):
        """
        Check whether the alignments are legal.

        :param a: alignment to be checked
        :raise IndexError: if alignment is out of sentence boundary
        :rtype: boolean
        """
        if not all(0 <= p[0] < len(self._words) for p in a):
            raise IndexError("Alignment is outside boundary of words")
        if not all(0 <= p[1] < len(self._mots) for p in a):
            raise IndexError("Alignment is outside boundary of mots")
        return True

    def __repr__(self):
        """
        Return a string representation for this ``AlignedSent``.

        :rtype: str
        """
        words = "[%s]" % (", ".join("'%s'" % w for w in self._words))
        mots = "[%s]" % (", ".join("'%s'" % w for w in self._mots))
        return "AlignedSent(%s, %s, %r)" % (words, mots, self._alignment)

    def __str__(self):
        """
        Return a human-readable string representation for this
        ``AlignedSent``.

        :rtype: str
        """
        # Each side is truncated to its first 20 characters for display.
        source = " ".join(self._words)[:20] + "..."
        target = " ".join(self._mots)[:20] + "..."
        return "<AlignedSent: '%s' -> '%s'>" % (source, target)

    def invert(self):
        """
        Return the aligned sentence pair, reversing the directionality.

        :rtype: AlignedSent
        """
        return AlignedSent(self._mots, self._words,
                           self._alignment.invert())

    def precision(self, reference):
        """
        Return the precision of an aligned sentence with respect to a
        "gold standard" reference ``AlignedSent``.

        :type reference: AlignedSent or Alignment
        :param reference: A "gold standard" reference aligned sentence.
        :rtype: float or None
        """
        # The "possible" precision is used since it doesn't penalize for
        # finding an alignment that was marked as "possible" (NAACL corpus).
        align = self.alignment
        if isinstance(reference, AlignedSent):
            possible = reference.alignment
        else:
            possible = Alignment(reference)
        return precision(possible, align)

    def recall(self, reference):
        """
        Return the recall of an aligned sentence with respect to a
        "gold standard" reference ``AlignedSent``.

        :type reference: AlignedSent or Alignment
        :param reference: A "gold standard" reference aligned sentence.
        :rtype: float or None
        """
        # The "sure" recall is used so we don't penalize for missing an
        # alignment that was only marked as "possible".
        align = self.alignment
        if isinstance(reference, AlignedSent):
            sure = reference.alignment
        else:
            sure = Alignment(reference)
        # Call NLTK's existing function for recall.
        return recall(sure, align)

    def alignment_error_rate(self, reference, possible=None):
        """
        Return the Alignment Error Rate (AER) of an aligned sentence
        with respect to a "gold standard" reference ``AlignedSent``.

        Return an error rate between 0.0 (perfect alignment) and 1.0 (no
        alignment).

            >>> from nltk.align import AlignedSent
            >>> s = AlignedSent(["the", "cat"], ["le", "chat"], [(0, 0), (1, 1)])
            >>> s.alignment_error_rate(s)
            0.0

        :type reference: AlignedSent or Alignment
        :param reference: A "gold standard" reference aligned sentence.
        :type possible: AlignedSent or Alignment or None
        :param possible: A "gold standard" reference of possible alignments
            (defaults to *reference* if None)
        :rtype: float or None
        """
        # Get alignments in set-of-2-tuples form.
        align = self.alignment
        if isinstance(reference, AlignedSent):
            sure = reference.alignment
        else:
            sure = Alignment(reference)

        if possible is not None:
            # Set possible alignment
            if isinstance(possible, AlignedSent):
                possible = possible.alignment
            else:
                possible = Alignment(possible)
        else:
            # Possible alignment is just sure alignment
            possible = sure

        # Sanity check: the sure set must be a subset of the possible set.
        assert(sure.issubset(possible))

        # Return the Alignment Error Rate
        return (1.0 - float(len(align & sure) + len(align & possible)) /
                float(len(align) + len(sure)))
@compat.python_2_unicode_compatible
class Alignment(frozenset):
    """
    A storage class for representing alignment between two sequences,
    s1 and s2.  In general, an alignment is a set of tuples of the form
    (i, j, ...) representing an alignment between the i-th element of s1
    and the j-th element of s2.  Tuples are extensible: they may carry
    extra data such as a sure-vs-possible flag.

        >>> from nltk.align import Alignment
        >>> a = Alignment([(0, 0), (0, 1), (1, 2), (2, 2)])
        >>> a.invert()
        Alignment([(0, 0), (1, 0), (2, 1), (2, 2)])
        >>> print(a.invert())
        0-0 1-0 2-1 2-2
        >>> b = Alignment([(0, 0), (0, 1)])
        >>> b.issubset(a)
        True
        >>> b == Alignment('0-0 0-1')
        True
    """

    def __new__(cls, string_or_pairs):
        # A Giza-formatted string ("0-0 1-2 ...") is tokenized into pairs;
        # anything else is assumed to already be an iterable of tuples.
        pairs = string_or_pairs
        if isinstance(pairs, compat.string_types):
            pairs = [_giza2pair(token) for token in pairs.split()]
        instance = frozenset.__new__(cls, pairs)
        # Largest source-side index, used to size the lazily built lookup
        # index (see _build_index).
        instance._len = max(pair[0] for pair in instance) if instance else 0
        instance._index = None
        return instance

    def __getitem__(self, key):
        """
        Look up the alignments that map from a given index or slice.
        """
        if not self._index:
            self._build_index()
        return self._index[key]

    def invert(self):
        """
        Return an Alignment object, being the inverted mapping.
        """
        return Alignment((pair[1], pair[0]) + pair[2:] for pair in self)

    def range(self, positions=None):
        """
        Work out the range of the mapping from the given positions.
        If no positions are specified, compute the range of the entire mapping.
        """
        if not self._index:
            self._build_index()
        if not positions:
            positions = list(range(len(self._index)))
        image = set()
        for position in positions:
            for _, target in self._index[position]:
                image.add(target)
        return sorted(image)

    def __repr__(self):
        """
        Produce a Giza-formatted string representing the alignment.
        """
        return "Alignment(%r)" % sorted(self)

    def __str__(self):
        """
        Produce a Giza-formatted string representing the alignment.
        """
        return " ".join("%d-%d" % pair[:2] for pair in sorted(self))

    def _build_index(self):
        """
        Build a list self._index such that self._index[i] is a list
        of the alignments originating from word i.
        """
        buckets = [[] for _ in range(self._len + 1)]
        for pair in self:
            buckets[pair[0]].append(pair)
        self._index = buckets
class IBMModel1(object):
    """
    This class implements the Expectation Maximization algorithm for
    IBM Model 1.  The algorithm runs upon a sentence-aligned parallel
    corpus and generates word alignments in aligned sentence pairs.
    The process is divided into 2 stages:

    - Stage 1: Calculates word-to-word translation probabilities by
      collecting evidence of an English word being the translation of a
      foreign word from the parallel corpus.
    - Stage 2: Generates updated word alignments for the sentence pairs,
      based on the translation probabilities from Stage 1.

        >>> corpus = [AlignedSent(['the', 'house'], ['das', 'Haus']),
        ...           AlignedSent(['the', 'book'], ['das', 'Buch']),
        ...           AlignedSent(['a', 'book'], ['ein', 'Buch'])]
        >>> ibm1 = IBMModel1(corpus)
        >>> print("%.1f" % ibm1.probabilities['book', 'Buch'])
        1.0
        >>> print("%.1f" % ibm1.probabilities['book', 'das'])
        0.0
        >>> print("%.1f" % ibm1.probabilities['book', None])
        0.5

    :param aligned_sents: The parallel text ``corpus.Iterable`` containing
        AlignedSent instances of aligned sentence pairs from the corpus.
    :type aligned_sents: list(AlignedSent)
    :param convergent_threshold: The threshold value of convergence. An
        entry is considered converged if the delta from ``old_t`` to
        ``new_t`` is less than this value. The algorithm terminates when
        all entries are converged. This parameter is optional, default is
        0.01.
    :type convergent_threshold: float
    """

    def __init__(self, aligned_sents, convergent_threshold=1e-2, debug=False):
        self.aligned_sents = aligned_sents
        self.convergent_threshold = convergent_threshold
        # Dictionary of translation probabilities t(e,f); filled by _train().
        self.probabilities = None
        # NOTE(review): ``debug`` is accepted for backward compatibility but
        # is unused; verbosity is controlled via the ``logging`` module.
        self._train()

    def _train(self):
        """
        Perform Expectation Maximization training to learn
        word-to-word translation probabilities.
        """
        logging.debug("Starting training")

        # Collect up sets of all English and foreign words.
        english_words = set()
        foreign_words = set()
        for aligned_sent in self.aligned_sents:
            english_words.update(aligned_sent.words)
            foreign_words.update(aligned_sent.mots)
        # Add the NULL token to the foreign word set.
        foreign_words.add(None)
        num_probs = len(english_words) * len(foreign_words)

        # Initialise t(e|f) uniformly.
        default_prob = 1.0 / len(english_words)
        t = defaultdict(lambda: default_prob)

        convergent_threshold = self.convergent_threshold
        globally_converged = False
        iteration_count = 0
        while not globally_converged:
            # E-step accumulators: count(e|f) and total(f).
            count = defaultdict(float)
            total = defaultdict(float)

            for aligned_sent in self.aligned_sents:
                s_total = {}
                # The NULL token is a translation candidate for every
                # English word; build the candidate list once per sentence
                # instead of rebuilding it in every inner loop (hoisted
                # loop-invariant, behavior unchanged).
                mots_and_null = aligned_sent.mots + [None]

                # Compute normalization.
                for e_w in aligned_sent.words:
                    s_total[e_w] = 0.0
                    for f_w in mots_and_null:
                        s_total[e_w] += t[e_w, f_w]

                # Collect fractional counts.
                for e_w in aligned_sent.words:
                    for f_w in mots_and_null:
                        cnt = t[e_w, f_w] / s_total[e_w]
                        count[e_w, f_w] += cnt
                        total[f_w] += cnt

            # M-step: re-estimate probabilities and count how many entries
            # changed by less than the convergence threshold.
            num_converged = 0
            for f_w in foreign_words:
                for e_w in english_words:
                    new_prob = count[e_w, f_w] / total[f_w]
                    delta = abs(t[e_w, f_w] - new_prob)
                    if delta < convergent_threshold:
                        num_converged += 1
                    t[e_w, f_w] = new_prob

            # Have we converged?
            iteration_count += 1
            if num_converged == num_probs:
                globally_converged = True
            logging.debug("%d/%d (%.2f%%) converged" %
                          (num_converged, num_probs,
                           100.0 * num_converged / num_probs))

        self.probabilities = dict(t)

    def aligned(self):
        """
        Return a list of AlignedSents with Alignments calculated using
        IBM-Model 1.

        :raise ValueError: if the model has not been trained
        :rtype: list(AlignedSent)
        """
        if self.probabilities is None:
            raise ValueError("No probabilities calculated")

        aligned = []
        # Alignment learning from t(e|f).
        for aligned_sent in self.aligned_sents:
            alignment = []
            # For every English word, find the foreign word (or the NULL
            # token) that maximizes t(e|f).
            for j, e_w in enumerate(aligned_sent.words):
                # NULL token is the initial candidate.
                best_prob = self.probabilities[e_w, None]
                best_pos = None
                for i, f_w in enumerate(aligned_sent.mots):
                    prob = self.probabilities[e_w, f_w]
                    # BUG FIX: the original used
                    #   max(f_max, (prob, i)) with f_max = (prob, None)
                    # which raises TypeError on Python 3 when probabilities
                    # tie (None and int are unorderable). ``>=`` reproduces
                    # the Python 2 tie-breaking: the highest index wins a
                    # tie, and any real word beats NULL on a tie.
                    if prob >= best_prob:
                        best_prob = prob
                        best_pos = i

                # Only output alignments with a non-NULL mapping.
                if best_pos is not None:
                    alignment.append((j, best_pos))

            # Substitute the alignment of AlignedSent with the yielded one.
            aligned.append(AlignedSent(aligned_sent.words,
                                       aligned_sent.mots, alignment))

        return aligned
def _giza2pair(pair_string): i, j = pair_string.split("-") return int(i), int(j) def _naacl2pair(pair_string): i, j, p = pair_string.split("-") return int(i), int(j) if __name__ == "__main__": import doctest doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)