Source code for nltk.align.gale_church

# -*- coding: utf-8 -*-

# Natural Language Toolkit: Gale-Church Aligner
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Torsten Marek <marek@ifi.uzh.ch>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""

A port of the Gale-Church Aligner.

Gale & Church (1993), A Program for Aligning Sentences in Bilingual Corpora.
http://aclweb.org/anthology/J93-1004.pdf

"""

from __future__ import division
import math

try:
    from scipy.stats import norm
    from norm import logsf as norm_logsf
except ImportError:
[docs]    def erfcc(x):
        """Complementary error function."""
        z = abs(x)
        t = 1 / (1 + 0.5 * z)
        r = t * math.exp(-z * z -
                         1.26551223 + t *
                         (1.00002368 + t *
                          (.37409196 + t *
                           (.09678418 + t *
                            (-.18628806 + t *
                             (.27886807 + t *
                              (-1.13520398 + t *
                               (1.48851587 + t *
                                (-.82215223 + t * .17087277)))))))))
        if x >= 0.:
            return r
        else:
            return 2. - r


[docs]    def norm_cdf(x):
        """Return the area under the normal distribution from M{-∞..x}."""
        return 1 - 0.5 * erfcc(x / math.sqrt(2))


[docs]    def norm_logsf(x):
        try:
            return math.log(1 - norm_cdf(x))
        except ValueError:
            return float('-inf')


LOG2 = math.log(2)


[docs]class LanguageIndependent(object):
    # These are the language-independent probabilities and parameters
    # given in Gale & Church

    # for the computation, l_1 is always the language with less characters
    PRIORS = {
        (1, 0): 0.0099,
        (0, 1): 0.0099,
        (1, 1): 0.89,
        (2, 1): 0.089,
        (1, 2): 0.089,
        (2, 2): 0.011,
    }

    AVERAGE_CHARACTERS = 1
    VARIANCE_CHARACTERS = 6.8


[docs]def trace(backlinks, source, target):
    links = []
    pos = (len(source), len(target))

    while pos != (0, 0):
        s, t = backlinks[pos]
        for i in range(s):
            for j in range(t):
                links.append((pos[0] - i - 1, pos[1] - j - 1))
        pos = (pos[0] - s, pos[1] - t)

    return links[::-1]


[docs]def align_log_prob(i, j, source_sents, target_sents, alignment, params):
    """Returns the log probability of the two sentences C{source_sents[i]}, C{target_sents[j]}
    being aligned with a specific C{alignment}.

    @param i: The offset of the source sentence.
    @param j: The offset of the target sentence.
    @param source_sents: The list of source sentence lengths.
    @param target_sents: The list of target sentence lengths.
    @param alignment: The alignment type, a tuple of two integers.
    @param params: The sentence alignment parameters.

    @returns: The log probability of a specific alignment between the two sentences, given the parameters.
    """
    l_s = sum(source_sents[i - offset - 1] for offset in range(alignment[0]))
    l_t = sum(target_sents[j - offset - 1] for offset in range(alignment[1]))
    try:
        # actually, the paper says l_s * params.VARIANCE_CHARACTERS, this is based on the C
        # reference implementation. With l_s in the denominator, insertions are impossible.
        m = (l_s + l_t / params.AVERAGE_CHARACTERS) / 2
        delta = (l_s * params.AVERAGE_CHARACTERS - l_t) / math.sqrt(m * params.VARIANCE_CHARACTERS)
    except ZeroDivisionError:
        return float('-inf')

    return - (LOG2 + norm_logsf(abs(delta)) + math.log(params.PRIORS[alignment]))


[docs]def align_blocks(source_sents, target_sents, params = LanguageIndependent):
    """Return the sentence alignment of two text blocks (usually paragraphs).

        >>> align_blocks([5,5,5], [7,7,7])
        [(0, 0), (1, 1), (2, 2)]
        >>> align_blocks([10,5,5], [12,20])
        [(0, 0), (1, 1), (2, 1)]
        >>> align_blocks([12,20], [10,5,5])
        [(0, 0), (1, 1), (1, 2)]
        >>> align_blocks([10,2,10,10,2,10], [12,3,20,3,12])
        [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)]

    @param source_sents: The list of source sentence lengths.
    @param target_sents: The list of target sentence lengths.
    @param params: the sentence alignment parameters.
    @return: The sentence alignments, a list of index pairs.
    """

    alignment_types = list(params.PRIORS.keys())

    # there are always three rows in the history (with the last of them being filled)
    D = [[]]

    backlinks = {}

    for i in range(len(source_sents) + 1):
        for j in range(len(target_sents) + 1):
            min_dist = float('inf')
            min_align = None
            for a in alignment_types:
                prev_i = - 1 - a[0]
                prev_j = j - a[1]
                if prev_i < -len(D) or prev_j < 0:
                    continue
                p = D[prev_i][prev_j] + align_log_prob(i, j, source_sents, target_sents, a, params)
                if p < min_dist:
                    min_dist = p
                    min_align = a

            if min_dist == float('inf'):
                min_dist = 0

            backlinks[(i, j)] = min_align
            D[-1].append(min_dist)

        if len(D) > 2:
            D.pop(0)
        D.append([])

    return trace(backlinks, source_sents, target_sents)


[docs]def align_texts(source_blocks, target_blocks, params = LanguageIndependent):
    """Creates the sentence alignment of two texts.

    Texts can consist of several blocks. Block boundaries cannot be crossed by sentence 
    alignment links. 

    Each block consists of a list that contains the lengths (in characters) of the sentences
    in this block.
    
    @param source_blocks: The list of blocks in the source text.
    @param target_blocks: The list of blocks in the target text.
    @param params: the sentence alignment parameters.

    @returns: A list of sentence alignment lists
    """
    if len(source_blocks) != len(target_blocks):
        raise ValueError("Source and target texts do not have the same number of blocks.")
    
    return [align_blocks(source_block, target_block, params) 
            for source_block, target_block in zip(source_blocks, target_blocks)]


# File I/O functions; may belong in a corpus reader

[docs]def split_at(it, split_value):
    """Splits an iterator C{it} at values of C{split_value}. 

    Each instance of C{split_value} is swallowed. The iterator produces
    subiterators which need to be consumed fully before the next subiterator
    can be used.
    """
    def _chunk_iterator(first):
        v = first
        while v != split_value:
            yield v
            v = it.next()
    
    while True:
        yield _chunk_iterator(it.next())
        

[docs]def parse_token_stream(stream, soft_delimiter, hard_delimiter):
    """Parses a stream of tokens and splits it into sentences (using C{soft_delimiter} tokens) 
    and blocks (using C{hard_delimiter} tokens) for use with the L{align_texts} function.
    """
    return [
        [sum(len(token) for token in sentence_it) 
         for sentence_it in split_at(block_it, soft_delimiter)]
        for block_it in split_at(stream, hard_delimiter)]




#    Code for test files in nltk_contrib/align/data/*.tok
#    import sys
#    from contextlib import nested
#    with nested(open(sys.argv[1], "r"), open(sys.argv[2], "r")) as (s, t):
#        source = parse_token_stream((l.strip() for l in s), ".EOS", ".EOP")
#        target = parse_token_stream((l.strip() for l in t), ".EOS", ".EOP")
#        print align_texts(source, target)
Source code for nltk.align.gale_church

Table Of Contents

Search