
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
#         Manu Joseph <manujosephv@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Language Models"""

from nltk.lm.api import LanguageModel, Smoothing
from nltk.lm.smoothing import AbsoluteDiscounting, KneserNey, WittenBell


class MLE(LanguageModel):
    """Class for providing MLE ngram model scores.

    Inherits initialization from LanguageModel.
    """

    def unmasked_score(self, word, context=None):
        """Returns the MLE score for a word given a context.

        Args:
        - word is expected to be a string
        - context is expected to be something reasonably convertible to a tuple
        """
        return self.context_counts(context).freq(word)
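

# Illustrative usage sketch, not part of the original module: fit a bigram
# MLE model on toy data and query a plain relative-frequency score. The helper
# name `_demo_mle` and the toy sentences are assumptions made for this demo.
def _demo_mle():
    from nltk.lm.models import MLE
    from nltk.lm.preprocessing import padded_everygram_pipeline

    train, vocab = padded_everygram_pipeline(2, [["a", "b", "b"], ["b", "a"]])
    lm = MLE(2)
    lm.fit(train, vocab)
    # In the padded data the context ("a",) is followed by "b" once and by
    # "</s>" once, so the maximum-likelihood estimate of P(b | a) is 0.5.
    return lm.score("b", ["a"])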


class Lidstone(LanguageModel):
    """Provides Lidstone-smoothed scores.

    In addition to initialization arguments from LanguageModel also requires
    a number by which to increase the counts, gamma.
    """

    def __init__(self, gamma, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.gamma = gamma

    def unmasked_score(self, word, context=None):
        """Additive smoothing: Lidstone, or Laplace when gamma is 1.

        To see which, inspect the `gamma` attribute on the class.
        """
        counts = self.context_counts(context)
        word_count = counts[word]
        norm_count = counts.N()
        return (word_count + self.gamma) / (norm_count + len(self.vocab) * self.gamma)
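

# Illustrative sketch, not part of the original module: with gamma = 0.1 the
# score is (count(word, context) + gamma) / (N(context) + gamma * |V|), so an
# unseen word in a known context still gets a small nonzero probability.
def _demo_lidstone():
    from nltk.lm.models import Lidstone
    from nltk.lm.preprocessing import padded_everygram_pipeline

    train, vocab = padded_everygram_pipeline(2, [["a", "b", "b"], ["b", "a"]])
    lm = Lidstone(0.1, 2)  # gamma comes first, then the ngram order
    lm.fit(train, vocab)
    return lm.score("a", ["a"])  # bigram ("a", "a") is unseen, score still > 0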


class Laplace(Lidstone):
    """Implements Laplace (add one) smoothing.

    Initialization identical to LanguageModel because gamma is always 1.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(1, *args, **kwargs)
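

# Illustrative sketch, not part of the original module: Laplace is Lidstone
# with gamma pinned to 1, so the two models produce identical scores.
def _demo_laplace():
    from nltk.lm.models import Laplace, Lidstone
    from nltk.lm.preprocessing import padded_everygram_pipeline

    sents = [["a", "b", "b"], ["b", "a"]]
    train_a, vocab_a = padded_everygram_pipeline(2, sents)
    train_b, vocab_b = padded_everygram_pipeline(2, sents)
    laplace, lidstone = Laplace(2), Lidstone(1, 2)
    laplace.fit(train_a, vocab_a)  # the pipelines are generators, hence two copies
    lidstone.fit(train_b, vocab_b)
    assert laplace.score("b", ["a"]) == lidstone.score("b", ["a"])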


class StupidBackoff(LanguageModel):
    """Provides StupidBackoff scores.

    In addition to initialization arguments from LanguageModel also requires
    a parameter alpha with which we scale the lower order probabilities.
    Note that this is not a true probability distribution as scores for ngrams
    of the same order do not sum up to unity.
    """

    def __init__(self, alpha=0.4, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.alpha = alpha

    def unmasked_score(self, word, context=None):
        if not context:
            # Base recursion
            return self.counts.unigrams.freq(word)
        counts = self.context_counts(context)
        word_count = counts[word]
        norm_count = counts.N()
        if word_count > 0:
            return word_count / norm_count
        else:
            return self.alpha * self.unmasked_score(word, context[1:])
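

# Illustrative sketch, not part of the original module: for an unseen bigram
# the score backs off to alpha times the unigram relative frequency, which is
# why scores of a given order need not sum to one.
def _demo_stupid_backoff():
    from nltk.lm.models import StupidBackoff
    from nltk.lm.preprocessing import padded_everygram_pipeline

    train, vocab = padded_everygram_pipeline(2, [["a", "b", "b"], ["b", "a"]])
    lm = StupidBackoff(alpha=0.4, order=2)
    lm.fit(train, vocab)
    # "a" never follows "a" in training, so this returns 0.4 * freq("a").
    return lm.score("a", ["a"])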


class InterpolatedLanguageModel(LanguageModel):
    """Logic common to all interpolated language models.

    The idea to abstract this comes from Chen & Goodman 1995.
    Do not instantiate this class directly!
    """

    def __init__(self, smoothing_cls, order, **kwargs):
        params = kwargs.pop("params", {})
        super().__init__(order, **kwargs)
        self.estimator = smoothing_cls(self.vocab, self.counts, **params)

    def unmasked_score(self, word, context=None):
        if not context:
            # The base recursion case: no context, we only have a unigram.
            return self.estimator.unigram_score(word)
        if not self.counts[context]:
            # It can also happen that we have no data for this context.
            # In that case we defer to the lower-order ngram.
            # This is the same as setting alpha to 0 and gamma to 1.
            alpha, gamma = 0, 1
        else:
            alpha, gamma = self.estimator.alpha_gamma(word, context)
        return alpha + gamma * self.unmasked_score(word, context[1:])


class WittenBellInterpolated(InterpolatedLanguageModel):
    """Interpolated version of Witten-Bell smoothing."""

    def __init__(self, order, **kwargs):
        super().__init__(WittenBell, order, **kwargs)
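

# Illustrative sketch, not part of the original module: interpolated models
# score a word as alpha + gamma * lower_order_score, recursing down to the
# unigram level, so even unseen bigrams receive probability mass.
def _demo_witten_bell():
    from nltk.lm.models import WittenBellInterpolated
    from nltk.lm.preprocessing import padded_everygram_pipeline

    train, vocab = padded_everygram_pipeline(2, [["a", "b", "b"], ["b", "a"]])
    lm = WittenBellInterpolated(2)
    lm.fit(train, vocab)
    return lm.score("a", ["a"])  # unseen bigram, nonzero via the unigram term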


class AbsoluteDiscountingInterpolated(InterpolatedLanguageModel):
    """Interpolated version of smoothing with absolute discount."""

    def __init__(self, order, discount=0.75, **kwargs):
        super().__init__(
            AbsoluteDiscounting, order, params={"discount": discount}, **kwargs
        )


class KneserNeyInterpolated(InterpolatedLanguageModel):
    """Interpolated version of Kneser-Ney smoothing."""

    def __init__(self, order, discount=0.1, **kwargs):
        if not (0 <= discount <= 1):
            raise ValueError(
                "Discount must be between 0 and 1 for probabilities to sum to unity."
            )
        super().__init__(
            KneserNey, order, params={"discount": discount, "order": order}, **kwargs
        )
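

# Illustrative sketch, not part of the original module: the discount must lie
# in [0, 1], and out-of-range values are rejected before any fitting happens.
def _demo_kneser_ney():
    from nltk.lm.models import KneserNeyInterpolated
    from nltk.lm.preprocessing import padded_everygram_pipeline

    try:
        KneserNeyInterpolated(2, discount=1.5)
    except ValueError:
        pass  # rejected by the range check in __init__ above

    train, vocab = padded_everygram_pipeline(2, [["a", "b", "b"], ["b", "a"]])
    lm = KneserNeyInterpolated(2, discount=0.1)
    lm.fit(train, vocab)
    return lm.score("b", ["a"])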