# Source code for nltk.classify.maxent

# Natural Language Toolkit: Maximum Entropy Classifiers
#
# Copyright (C) 2001-2022 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Dmitry Chichkov <dchichkov@gmail.com> (TypedMaxentFeatureEncoding)
# URL: <https://www.nltk.org/>

"""
A classifier model based on maximum entropy modeling framework.  This
framework considers all of the probability distributions that are
empirically consistent with the training data; and chooses the
distribution with the highest entropy.  A probability distribution is
"empirically consistent" with a set of training data if its estimated
frequency with which a class and a feature vector value co-occur is
equal to the actual frequency in the data.

Terminology: 'feature'
======================
The term *feature* is usually used to refer to some property of an
unlabeled token.  For example, when performing word sense
disambiguation, we might define a ``'prevword'`` feature whose value is
the word preceding the target word.  However, in the context of
maxent modeling, the term *feature* is typically used to refer to a
property of a "labeled" token.  In order to prevent confusion, we
will introduce two distinct terms to disambiguate these two different
concepts:

- An "input-feature" is a property of an unlabeled token.
- A "joint-feature" is a property of a labeled token.

In the rest of the ``nltk.classify`` module, the term "features" is
used to refer to what we will call "input-features" in this module.

In literature that describes and discusses maximum entropy models,
input-features are typically called "contexts", and joint-features
are simply referred to as "features".

Converting Input-Features to Joint-Features
-------------------------------------------
In maximum entropy models, joint-features are required to have numeric
values.  Typically, each input-feature ``input_feat`` is mapped to a
set of joint-features of the form:

|   joint_feat(token, label) = { 1 if input_feat(token) == feat_val
|                              {      and label == some_label
|                              {
|                              { 0 otherwise

For all values of ``feat_val`` and ``some_label``.  This mapping is
performed by classes that implement the ``MaxentFeatureEncodingI``
interface.
"""
import os
import tempfile
from collections import defaultdict

try:
    import numpy
except ImportError:
    pass

from nltk.classify.api import ClassifierI
from nltk.classify.megam import call_megam, parse_megam_weights, write_megam_file
from nltk.classify.util import CutoffChecker, accuracy, log_likelihood
from nltk.data import gzip_open_unicode
from nltk.probability import DictionaryProbDist
from nltk.util import OrderedDict

__docformat__ = "epytext en"

######################################################################
# { Classifier Model
######################################################################

class MaxentClassifier(ClassifierI):
    """
    A maximum entropy classifier (also known as a "conditional
    exponential classifier").  This classifier is parameterized by a
    set of "weights", which are used to combine the joint-features
    that are generated from a featureset by an "encoding".  In
    particular, the encoding maps each ``(featureset, label)`` pair to
    a vector.  The probability of each label is then computed using
    the following equation::

      |                            dotprod(weights, encode(fs,label))
      | prob(fs|label) = ---------------------------------------------------
      |                  sum(dotprod(weights, encode(fs,l)) for l in labels)

    Where ``dotprod`` is the dot product::

      dotprod(a,b) = sum(x*y for (x,y) in zip(a,b))
    """

    def __init__(self, encoding, weights, logarithmic=True):
        """
        Construct a new maxent classifier model.  Typically, new
        classifier models are created using the ``train()`` method.

        :type encoding: MaxentFeatureEncodingI
        :param encoding: An encoding that is used to convert the
            featuresets that are given to the ``classify`` method into
            joint-feature vectors, which are used by the maxent
            classifier model.

        :type weights: list of float
        :param weights:  The feature weight vector for this classifier.

        :type logarithmic: bool
        :param logarithmic: If false, then use non-logarithmic weights.
        """
        self._encoding = encoding
        self._weights = weights
        self._logarithmic = logarithmic
        assert encoding.length() == len(weights)

    def labels(self):
        """
        :return: The labels known to this classifier's encoding.
        """
        return self._encoding.labels()

    def set_weights(self, new_weights):
        """
        Set the feature weight vector for this classifier.

        :param new_weights: The new feature weight vector.
        :type new_weights: list of float
        """
        self._weights = new_weights
        # The ranking cached by most_informative_features() was computed
        # from the previous weights, so it must not outlive them.
        if hasattr(self, "_most_informative_features"):
            del self._most_informative_features
        assert self._encoding.length() == len(new_weights)

    def weights(self):
        """
        :return: The feature weight vector for this classifier.
        :rtype: list of float
        """
        return self._weights

    def classify(self, featureset):
        """
        :return: The most probable label for ``featureset``.
        """
        return self.prob_classify(featureset).max()

    def prob_classify(self, featureset):
        """
        :return: A probability distribution over labels for the given
            featureset, obtained by normalizing the per-label scores.
        :rtype: DictionaryProbDist
        """
        prob_dict = {}
        for label in self._encoding.labels():
            feature_vector = self._encoding.encode(featureset, label)

            if self._logarithmic:
                # Logarithmic weights combine additively.
                total = 0.0
                for f_id, f_val in feature_vector:
                    total += self._weights[f_id] * f_val
                prob_dict[label] = total
            else:
                # Non-logarithmic weights combine multiplicatively.
                prod = 1.0
                for f_id, f_val in feature_vector:
                    prod *= self._weights[f_id] ** f_val
                prob_dict[label] = prod

        # Normalize the dictionary to give a probability distribution
        return DictionaryProbDist(prob_dict, log=self._logarithmic, normalize=True)

    def explain(self, featureset, columns=4):
        """
        Print a table showing the effect of each of the features in
        the given feature set, and how they combine to determine the
        probabilities of each label for that featureset.

        :param columns: The maximum number of labels (most probable
            first) to show as columns.
        """
        descr_width = 50
        TEMPLATE = "  %-" + str(descr_width - 2) + "s%s%8.3f"

        pdist = self.prob_classify(featureset)
        labels = sorted(pdist.samples(), key=pdist.prob, reverse=True)
        labels = labels[:columns]
        print(
            "  Feature".ljust(descr_width)
            + "".join("%8s" % (("%s" % lbl)[:7]) for lbl in labels)
        )
        print("  " + "-" * (descr_width - 2 + 8 * len(labels)))
        sums = defaultdict(int)
        for i, label in enumerate(labels):
            # Each entry is an (f_id, f_val) pair; rank by the magnitude of
            # the weight attached to the feature id.  ``sorted`` is used so
            # the list returned by the encoding is never mutated in place.
            feature_vector = sorted(
                self._encoding.encode(featureset, label),
                key=lambda pair: abs(self._weights[pair[0]]),
                reverse=True,
            )
            for f_id, f_val in feature_vector:
                if self._logarithmic:
                    score = self._weights[f_id] * f_val
                else:
                    score = self._weights[f_id] ** f_val
                descr = self._encoding.describe(f_id)
                # hack: keep only the part before the label clause, since
                # the label is already shown by the column position.
                descr = descr.split(" and label is ")[0]
                descr += " (%s)" % f_val  # hack
                if len(descr) > 47:
                    descr = descr[:44] + "..."
                print(TEMPLATE % (descr, i * 8 * " ", score))
                sums[label] += score
        print("  " + "-" * (descr_width - 1 + 8 * len(labels)))
        print(
            "  TOTAL:".ljust(descr_width)
            + "".join("%8.3f" % sums[lbl] for lbl in labels)
        )
        print(
            "  PROBS:".ljust(descr_width)
            + "".join("%8.3f" % pdist.prob(lbl) for lbl in labels)
        )

    def most_informative_features(self, n=10):
        """
        Generates the ranked list of informative features from most to least.

        :param n: The number of feature ids to return; ``None`` returns
            the full ranking.
        :return: Feature ids ordered by decreasing absolute weight.
        """
        if not hasattr(self, "_most_informative_features"):
            # Computed lazily and cached on the instance.
            self._most_informative_features = sorted(
                range(len(self._weights)),
                key=lambda fid: abs(self._weights[fid]),
                reverse=True,
            )
        return self._most_informative_features[:n]

    def show_most_informative_features(self, n=10, show="all"):
        """
        Print the ``n`` most informative features with their weights.

        :param show: all, neg, or pos (for negative-only or positive-only)
        :type show: str
        :param n: The no. of top features
        :type n: int
        """
        # Use None to get the full list of ranked features.
        fids = self.most_informative_features(None)
        if show == "pos":
            fids = [fid for fid in fids if self._weights[fid] > 0]
        elif show == "neg":
            fids = [fid for fid in fids if self._weights[fid] < 0]
        for fid in fids[:n]:
            print(f"{self._weights[fid]:8.3f} {self._encoding.describe(fid)}")

    def __repr__(self):
        return "<ConditionalExponentialClassifier: %d labels, %d features>" % (
            len(self._encoding.labels()),
            self._encoding.length(),
        )

    #: A list of the algorithm names that are accepted for the
    #: ``train()`` method's ``algorithm`` parameter.
    ALGORITHMS = ["GIS", "IIS", "MEGAM", "TADM"]

    @classmethod
    def train(
        cls,
        train_toks,
        algorithm=None,
        trace=3,
        encoding=None,
        labels=None,
        gaussian_prior_sigma=0,
        **cutoffs,
    ):
        """
        Train a new maxent classifier based on the given corpus of
        training samples.  This classifier will have its weights
        chosen to maximize entropy while remaining empirically
        consistent with the training corpus.

        :rtype: MaxentClassifier
        :return: The new maxent classifier

        :type train_toks: list
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a featureset,
            and the second of which is a classification label.

        :type algorithm: str
        :param algorithm: A case-insensitive string, specifying which
            algorithm should be used to train the classifier.  The
            following algorithms are currently available.

            - Iterative Scaling Methods: Generalized Iterative Scaling (``'GIS'``),
              Improved Iterative Scaling (``'IIS'``)
            - External Libraries (requiring megam):
              LM-BFGS algorithm, with training performed by Megam (``'megam'``)
            - ``'tadm'`` (listed in ``ALGORITHMS``)

            The default algorithm is ``'IIS'``.

        :type trace: int
        :param trace: The level of diagnostic tracing output to produce.
            Higher values produce more verbose output.
        :type encoding: MaxentFeatureEncodingI
        :param encoding: A feature encoding, used to convert featuresets
            into feature vectors.  If none is specified, then a
            ``BinaryMaxentFeatureEncoding`` will be built based on the
            features that are attested in the training corpus.
        :type labels: list(str)
        :param labels: The set of possible labels.  If none is given, then
            the set of all labels attested in the training data will be
            used instead.
        :param gaussian_prior_sigma: The sigma value for a gaussian
            prior on model weights.  Currently, this is supported by
            ``megam``. For other algorithms, its value is ignored.
        :param cutoffs: Arguments specifying various conditions under
            which the training should be halted.  (Some of the cutoff
            conditions are not supported by some algorithms.)

            - ``max_iter=v``: Terminate after ``v`` iterations.
            - ``min_ll=v``: Terminate after the negative average
              log-likelihood drops under ``v``.
            - ``min_lldelta=v``: Terminate if a single iteration improves
              log likelihood by less than ``v``.

        :raise TypeError: If an unrecognized keyword argument is given.
        :raise ValueError: If ``algorithm`` is not a known algorithm name.
        """
        if algorithm is None:
            algorithm = "iis"
        for key in cutoffs:
            if key not in (
                "max_iter",
                "min_ll",
                "min_lldelta",
                "max_acc",
                "min_accdelta",
                "count_cutoff",
                "norm",
                "explicit",
                "bernoulli",
            ):
                raise TypeError("Unexpected keyword arg %r" % key)
        algorithm = algorithm.lower()
        if algorithm == "iis":
            return train_maxent_classifier_with_iis(
                train_toks, trace, encoding, labels, **cutoffs
            )
        elif algorithm == "gis":
            return train_maxent_classifier_with_gis(
                train_toks, trace, encoding, labels, **cutoffs
            )
        elif algorithm == "megam":
            return train_maxent_classifier_with_megam(
                train_toks, trace, encoding, labels, gaussian_prior_sigma, **cutoffs
            )
        elif algorithm == "tadm":
            # NOTE(review): this branch header and its return were missing,
            # which left the kwargs assembly below unreachable.
            # ``TadmMaxentClassifier`` is presumed to be defined further down
            # in this module (as in upstream NLTK) -- confirm.
            kwargs = dict(cutoffs)
            kwargs["trace"] = trace
            kwargs["encoding"] = encoding
            kwargs["labels"] = labels
            kwargs["gaussian_prior_sigma"] = gaussian_prior_sigma
            return TadmMaxentClassifier.train(train_toks, **kwargs)
        else:
            raise ValueError("Unknown algorithm %s" % algorithm)

#: Alias for MaxentClassifier, exposing the model under its other common
#: name ("conditional exponential classifier").
ConditionalExponentialClassifier = MaxentClassifier

######################################################################
# { Feature Encodings
######################################################################

class MaxentFeatureEncodingI:
    """
    A mapping that converts a set of input-feature values to a vector
    of joint-feature values, given a label.  This conversion is
    necessary to translate featuresets into a format that can be used
    by maximum entropy models.

    The set of joint-features used by a given encoding is fixed, and
    each index in the generated joint-feature vectors corresponds to a
    single joint-feature.  The length of the generated joint-feature
    vectors is therefore constant (for a given encoding).

    Because the joint-feature vectors generated by
    ``MaxentFeatureEncodingI`` are typically very sparse, they are
    represented as a list of ``(index, value)`` tuples, specifying the
    value of each non-zero joint-feature.

    Feature encodings are generally created using the ``train()``
    method, which generates an appropriate encoding based on the
    input-feature values and labels that are present in a given
    corpus.
    """

    def encode(self, featureset, label):
        """
        Given a (featureset, label) pair, return the corresponding
        vector of joint-feature values.  This vector is represented as
        a list of ``(index, value)`` tuples, specifying the value of
        each non-zero joint-feature.

        :type featureset: dict
        :rtype: list(tuple(int, int))
        """
        raise NotImplementedError()

    def length(self):
        """
        :return: The size of the fixed-length joint-feature vectors
            that are generated by this encoding.
        :rtype: int
        """
        raise NotImplementedError()

    def labels(self):
        """
        :return: A list of the "known labels" -- i.e., all labels
            ``l`` such that ``self.encode(fs,l)`` can be a nonzero
            joint-feature vector for some value of ``fs``.
        :rtype: list
        """
        raise NotImplementedError()

    def describe(self, fid):
        """
        :return: A string describing the value of the joint-feature
            whose index in the generated feature vectors is ``fid``.
        :rtype: str
        """
        raise NotImplementedError()

    # Declared as a classmethod so that it matches the ``train()``
    # alternate constructors of the concrete encodings and can be called
    # on the class itself.
    @classmethod
    def train(cls, train_toks):
        """
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.
        """
        raise NotImplementedError()

class FunctionBackedMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that calls a user-supplied function to map a
    given featureset/label pair to a sparse joint-feature vector.
    """

    def __init__(self, func, length, labels):
        """
        Construct a new feature encoding based on the given function.

        :type func: (callable)
        :param func: A function that takes two arguments, a featureset
             and a label, and returns the sparse joint feature vector
             that encodes them::

                 func(featureset, label) -> feature_vector

             This sparse joint feature vector (``feature_vector``) is a
             list of ``(index,value)`` tuples.

        :type length: int
        :param length: The size of the fixed-length joint-feature
            vectors that are generated by this encoding.

        :type labels: list
        :param labels: A list of the "known labels" for this
            encoding -- i.e., all labels ``l`` such that
            ``self.encode(fs,l)`` can be a nonzero joint-feature vector
            for some value of ``fs``.
        """
        self._length = length
        self._func = func
        self._labels = labels

    def encode(self, featureset, label):
        """Return ``func(featureset, label)`` for the wrapped function."""
        return self._func(featureset, label)

    def length(self):
        """Return the vector length given at construction time."""
        return self._length

    def labels(self):
        """Return the labels given at construction time."""
        return self._labels

    def describe(self, fid):
        """Return a fixed placeholder; the wrapped function is opaque."""
        return "no description available"

class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that generates vectors containing binary
    joint-features of the form:

    |  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
    |                      {
    |                      { 0 otherwise

    Where ``fname`` is the name of an input-feature, ``fval`` is a value
    for that input-feature, and ``label`` is a label.

    Typically, these features are constructed based on a training
    corpus, using the ``train()`` method.  This method will create one
    feature for each combination of ``fname``, ``fval``, and ``label``
    that occurs at least once in the training corpus.

    The ``unseen_features`` parameter can be used to add "unseen-value
    features", which are used whenever an input feature has a value
    that was not encountered in the training corpus.  These features
    have the form:

    |  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
    |                      {      and l == label
    |                      {
    |                      { 0 otherwise

    Where ``is_unseen(fname, fval)`` is true if the encoding does not
    contain any joint features that are true when ``fs[fname]==fval``.

    The ``alwayson_features`` parameter can be used to add "always-on
    features", which have the form:

    |  joint_feat(fs, l) = { 1 if (l == label)
    |                      {
    |                      { 0 otherwise

    These always-on features allow the maxent model to directly model
    the prior probabilities of each label.
    """

    def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
        """
        :param labels: A list of the "known labels" for this encoding.

        :param mapping: A dictionary mapping from ``(fname,fval,label)``
            tuples to corresponding joint-feature indexes.  These
            indexes must be the set of integers from 0...len(mapping).
            If ``mapping[fname,fval,label]=id``, then
            ``self.encode(..., fname:fval, ..., label)[id]`` is 1;
            otherwise, it is 0.

        :param unseen_features: If true, then include unseen value
           features in the generated joint-feature vectors.

        :param alwayson_features: If true, then include always-on
           features in the generated joint-feature vectors.

        :raise ValueError: If the mapping's values are not exactly the
            integers ``0 .. len(mapping)-1``.
        """
        if set(mapping.values()) != set(range(len(mapping))):
            raise ValueError(
                "Mapping values must be exactly the "
                "set of integers from 0...len(mapping)"
            )

        # A list of attested labels.
        self._labels = list(labels)

        # dict mapping from (fname,fval,label) -> fid
        self._mapping = mapping

        # The length of generated joint feature vectors.
        self._length = len(mapping)

        # dict mapping from label -> fid (None when always-on is disabled)
        self._alwayson = None

        # dict mapping from fname -> fid (None when unseen is disabled)
        self._unseen = None

        # Extra features are appended after the mapped ones, so their ids
        # start at the current vector length.
        if alwayson_features:
            self._alwayson = {
                label: i + self._length for (i, label) in enumerate(self._labels)
            }
            self._length += len(self._alwayson)

        if unseen_features:
            fnames = {fname for (fname, fval, label) in mapping}
            self._unseen = {fname: i + self._length for (i, fname) in enumerate(fnames)}
            self._length += len(fnames)

    def encode(self, featureset, label):
        # Inherit docs.
        encoding = []

        # Convert input-features to joint-features:
        for fname, fval in featureset.items():
            # Known feature name & value:
            if (fname, fval, label) in self._mapping:
                encoding.append((self._mapping[fname, fval, label], 1))

            # Otherwise, we might want to fire an "unseen-value feature".
            elif self._unseen:
                # Have we seen this fname/fval combination with any label?
                seen = any(
                    (fname, fval, label2) in self._mapping for label2 in self._labels
                )
                # We haven't -- fire the unseen-value feature
                if not seen and fname in self._unseen:
                    encoding.append((self._unseen[fname], 1))

        if self._alwayson and label in self._alwayson:
            encoding.append((self._alwayson[label], 1))

        return encoding

    def describe(self, f_id):
        # Inherit docs.
        if not isinstance(f_id, int):
            raise TypeError("describe() expected an int")

        # Build the fid -> (fname, fval, label) table lazily, on first use.
        if not hasattr(self, "_inv_mapping"):
            self._inv_mapping = [-1] * len(self._mapping)
            for info, i in self._mapping.items():
                self._inv_mapping[i] = info

        # Negative ids are rejected rather than wrapping around the table.
        if 0 <= f_id < len(self._mapping):
            (fname, fval, label) = self._inv_mapping[f_id]
            return f"{fname}=={fval!r} and label is {label!r}"
        if self._alwayson:
            for label, f_id2 in self._alwayson.items():
                if f_id == f_id2:
                    return "label is %r" % label
        if self._unseen:
            for fname, f_id2 in self._unseen.items():
                if f_id == f_id2:
                    return "%s is unseen" % fname
        raise ValueError("Bad feature id")

    def labels(self):
        # Inherit docs.
        return self._labels

    def length(self):
        # Inherit docs.
        return self._length

    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        """
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.  See the class description
        ``BinaryMaxentFeatureEncoding`` for a description of the
        joint-features that will be included in this encoding.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.

        :type count_cutoff: int
        :param count_cutoff: A cutoff value that is used to discard
            rare joint-features.  If a joint-feature's value is 1
            fewer than ``count_cutoff`` times in the training corpus,
            then that joint-feature is not included in the generated
            encoding.

        :type labels: list
        :param labels: A list of labels that should be used by the
            classifier.  If not specified, then the set of labels
            attested in ``train_toks`` will be used.

        :param options: Extra parameters for the constructor, such as
            ``unseen_features`` and ``alwayson_features``.

        :raise ValueError: If ``labels`` is given and a training token
            carries a label that is not in it.
        """
        mapping = {}  # maps (fname, fval, label) -> fid
        seen_labels = set()  # The set of labels we've encountered
        count = defaultdict(int)  # maps (fname, fval) -> count

        for tok, label in train_toks:
            if labels and label not in labels:
                raise ValueError("Unexpected label %s" % label)
            # Without this the "attested labels" fallback below is empty.
            seen_labels.add(label)

            # Record each of the features.
            for fname, fval in tok.items():

                # If a count cutoff is given, then only add a joint
                # feature once the (fname, fval) pair has been seen at
                # least that many times (counted across all labels).
                count[fname, fval] += 1
                if count[fname, fval] >= count_cutoff:
                    if (fname, fval, label) not in mapping:
                        mapping[fname, fval, label] = len(mapping)

        if labels is None:
            labels = seen_labels
        return cls(labels, mapping, **options)

class GISEncoding(BinaryMaxentFeatureEncoding):
    """
    A binary feature encoding which adds one new joint-feature to the
    joint-features defined by ``BinaryMaxentFeatureEncoding``: a
    correction feature, whose value is chosen to ensure that the
    sparse vector always sums to a constant non-negative number.  This
    new feature is used to ensure two preconditions for the GIS
    training algorithm:

      - At least one feature vector index must be nonzero for every
        token.
      - The feature vector must sum to a constant non-negative number
        for every token.
    """

    def __init__(
        self, labels, mapping, unseen_features=False, alwayson_features=False, C=None
    ):
        """
        :param C: The correction constant.  The value of the correction
            feature is based on this value.  In particular, its value is
            ``C - sum([v for (f,v) in encoding])``.  If not given, it
            defaults to one more than the number of distinct feature
            names in ``mapping``.
        :seealso: ``BinaryMaxentFeatureEncoding.__init__``
        """
        BinaryMaxentFeatureEncoding.__init__(
            self, labels, mapping, unseen_features, alwayson_features
        )
        if C is None:
            C = len({fname for (fname, fval, label) in mapping}) + 1
        self._C = C

    @property
    def C(self):
        """The non-negative constant that all encoded feature vectors
        will sum to."""
        return self._C

    def encode(self, featureset, label):
        """
        Return the binary encoding of ``(featureset, label)`` with the
        correction feature appended as the last entry.

        :raise ValueError: If the binary features already sum to ``C``
            or more, leaving no room for the correction feature.
        """
        # Get the basic encoding.
        encoding = BinaryMaxentFeatureEncoding.encode(self, featureset, label)
        # The correction feature takes the id right after the base ones.
        base_length = BinaryMaxentFeatureEncoding.length(self)

        total = sum(v for (f, v) in encoding)
        if total >= self._C:
            raise ValueError("Correction feature is not high enough!")
        encoding.append((base_length, self._C - total))

        # Return the result
        return encoding

    def length(self):
        """Return the base encoding length plus one for the correction feature."""
        return BinaryMaxentFeatureEncoding.length(self) + 1

    def describe(self, f_id):
        """Describe ``f_id``, recognising the correction feature's id."""
        if f_id == BinaryMaxentFeatureEncoding.length(self):
            return "Correction feature (%s)" % self._C
        else:
            return BinaryMaxentFeatureEncoding.describe(self, f_id)

# NOTE(review): the ``class`` line for these methods was missing from this
# listing, which would have made them silently override GISEncoding's
# methods.  The name and base class below follow upstream NLTK -- confirm.
class TADMEventMaxentFeatureEncoding(BinaryMaxentFeatureEncoding):
    """
    A feature encoding keyed on ``(feature, label)`` pairs rather than
    ``(fname, fval, label)`` triples.  The feature's value is carried in
    the value slot of each ``(index, value)`` entry instead of being part
    of the joint-feature identity.
    """

    def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
        """
        :param labels: A list of the "known labels" for this encoding.
        :param mapping: A mapping from ``(feature, label)`` pairs to
            joint-feature indexes; it is copied into an ``OrderedDict``.
        :seealso: ``BinaryMaxentFeatureEncoding.__init__``
        """
        self._mapping = OrderedDict(mapping)
        # Maps each feature value seen by encode() to the number emitted
        # for it.
        self._label_mapping = OrderedDict()
        BinaryMaxentFeatureEncoding.__init__(
            self, labels, self._mapping, unseen_features, alwayson_features
        )

    def encode(self, featureset, label):
        """
        Return one ``(index, value)`` entry per input feature.

        Note that this method grows the encoding as a side effect:
        unknown ``(feature, label)`` pairs and unknown feature values are
        registered on first sight.
        """
        encoding = []
        for feature, value in featureset.items():
            if (feature, label) not in self._mapping:
                self._mapping[(feature, label)] = len(self._mapping)
            if value not in self._label_mapping:
                if not isinstance(value, int):
                    # Non-integer values get the next free integer code.
                    self._label_mapping[value] = len(self._label_mapping)
                else:
                    # Integer values stand for themselves.
                    self._label_mapping[value] = value
            encoding.append(
                (self._mapping[(feature, label)], self._label_mapping[value])
            )
        return encoding

    def labels(self):
        """Return the list of known labels."""
        return self._labels

    def describe(self, fid):
        """
        Return the ``(feature, label)`` pair whose index is ``fid``.

        Unlike the other encodings this returns a tuple, not a string,
        and falls through (returning ``None``) when ``fid`` is unknown.
        """
        for (feature, label) in self._mapping:
            if self._mapping[(feature, label)] == fid:
                return (feature, label)

    def length(self):
        """Return the current number of ``(feature, label)`` pairs."""
        return len(self._mapping)

    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        """
        Construct an encoding with one joint-feature for every
        combination of an attested feature name and a label.

        :param train_toks: Training data as ``(featureset, label)`` pairs.
        :param count_cutoff: Accepted for signature compatibility; not
            used by this encoding.
        :param labels: Labels to start from; labels attested in
            ``train_toks`` are added after them.  The caller's list is
            left untouched.
        :param options: Extra parameters for the constructor.
        """
        mapping = OrderedDict()
        # Work on a copy so the caller's list is never mutated.
        labels = list(labels) if labels else []

        # This gets read twice, so compute the values in case it's lazy.
        train_toks = list(train_toks)

        for _featureset, label in train_toks:
            if label not in labels:
                labels.append(label)

        # Every feature is paired with every label, not only with the
        # label it was observed with.
        for featureset, _label in train_toks:
            for label in labels:
                for feature in featureset:
                    if (feature, label) not in mapping:
                        mapping[(feature, label)] = len(mapping)

        return cls(labels, mapping, **options)

class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that generates vectors containing integer,
    float and binary joint-features of the form:

    Binary (for string and boolean features):

    |  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
    |                      {
    |                      { 0 otherwise

    Value (for integer and float features):

    |  joint_feat(fs, l) = { fval if     (fs[fname] == type(fval))
    |                      {         and (l == label)
    |                      {
    |                      { not encoded otherwise

    Where ``fname`` is the name of an input-feature, ``fval`` is a value
    for that input-feature, and ``label`` is a label.

    Typically, these features are constructed based on a training
    corpus, using the ``train()`` method.

    For string and boolean features [type(fval) not in (int, float)]
    this method will create one feature for each combination of
    ``fname``, ``fval``, and ``label`` that occurs at least once in the
    training corpus.

    For integer and float features [type(fval) in (int, float)] this
    method will create one feature for each combination of ``fname``
    and ``label`` that occurs at least once in the training corpus.

    For binary features the ``unseen_features`` parameter can be used
    to add "unseen-value features", which are used whenever an input
    feature has a value that was not encountered in the training
    corpus.  These features have the form:

    |  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
    |                      {      and l == label
    |                      {
    |                      { 0 otherwise

    Where ``is_unseen(fname, fval)`` is true if the encoding does not
    contain any joint features that are true when ``fs[fname]==fval``.

    The ``alwayson_features`` parameter can be used to add "always-on
    features", which have the form:

    |  joint_feat(fs, l) = { 1 if (l == label)
    |                      {
    |                      { 0 otherwise

    These always-on features allow the maxent model to directly model
    the prior probabilities of each label.
    """

    def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
        """
        :param labels: A list of the "known labels" for this encoding.

        :param mapping: A dictionary mapping from ``(fname,fval,label)``
            tuples to corresponding joint-feature indexes.  These
            indexes must be the set of integers from 0...len(mapping).
            If ``mapping[fname,fval,label]=id``, then
            ``self.encode({..., fname:fval, ...}, label)[id]`` is 1;
            otherwise, it is 0.

        :param unseen_features: If true, then include unseen value
           features in the generated joint-feature vectors.

        :param alwayson_features: If true, then include always-on
           features in the generated joint-feature vectors.

        :raise ValueError: If the mapping's values are not exactly the
            integers ``0 .. len(mapping)-1``.
        """
        if set(mapping.values()) != set(range(len(mapping))):
            raise ValueError(
                "Mapping values must be exactly the "
                "set of integers from 0...len(mapping)"
            )

        # A list of attested labels.
        self._labels = list(labels)

        # dict mapping from (fname,fval,label) -> fid
        self._mapping = mapping

        # The length of generated joint feature vectors.
        self._length = len(mapping)

        # dict mapping from label -> fid (None when always-on is disabled)
        self._alwayson = None

        # dict mapping from fname -> fid (None when unseen is disabled)
        self._unseen = None

        # Extra features are appended after the mapped ones, so their ids
        # start at the current vector length.
        if alwayson_features:
            self._alwayson = {
                label: i + self._length for (i, label) in enumerate(self._labels)
            }
            self._length += len(self._alwayson)

        if unseen_features:
            fnames = {fname for (fname, fval, label) in mapping}
            self._unseen = {fname: i + self._length for (i, fname) in enumerate(fnames)}
            self._length += len(fnames)

    def encode(self, featureset, label):
        # Inherit docs.
        encoding = []

        # Convert input-features to joint-features:
        for fname, fval in featureset.items():
            # Use the same exact-type test as train().  An isinstance()
            # check would also match bool (a subclass of int), but train()
            # records booleans as binary features, so they would never fire.
            if type(fval) in (int, float):
                # Known feature name & value type:
                key = (fname, type(fval), label)
                if key in self._mapping:
                    encoding.append((self._mapping[key], fval))

            # Known feature name & value:
            elif (fname, fval, label) in self._mapping:
                encoding.append((self._mapping[fname, fval, label], 1))

            # Otherwise, we might want to fire an "unseen-value feature".
            elif self._unseen:
                # Have we seen this fname/fval combination with any label?
                seen = any(
                    (fname, fval, label2) in self._mapping for label2 in self._labels
                )
                # We haven't -- fire the unseen-value feature
                if not seen and fname in self._unseen:
                    encoding.append((self._unseen[fname], 1))

        if self._alwayson and label in self._alwayson:
            encoding.append((self._alwayson[label], 1))

        return encoding

    def describe(self, f_id):
        # Inherit docs.
        if not isinstance(f_id, int):
            raise TypeError("describe() expected an int")

        # Build the fid -> (fname, fval, label) table lazily, on first use.
        if not hasattr(self, "_inv_mapping"):
            self._inv_mapping = [-1] * len(self._mapping)
            for info, i in self._mapping.items():
                self._inv_mapping[i] = info

        # Negative ids are rejected rather than wrapping around the table.
        if 0 <= f_id < len(self._mapping):
            (fname, fval, label) = self._inv_mapping[f_id]
            return f"{fname}=={fval!r} and label is {label!r}"
        if self._alwayson:
            for label, f_id2 in self._alwayson.items():
                if f_id == f_id2:
                    return "label is %r" % label
        if self._unseen:
            for fname, f_id2 in self._unseen.items():
                if f_id == f_id2:
                    return "%s is unseen" % fname
        raise ValueError("Bad feature id")

    def labels(self):
        # Inherit docs.
        return self._labels

    def length(self):
        # Inherit docs.
        return self._length

    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        """
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.  See the class description
        ``TypedMaxentFeatureEncoding`` for a description of the
        joint-features that will be included in this encoding.

        Note: recognized feature values types are (int, float), other
        types are interpreted as regular binary features.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.

        :type count_cutoff: int
        :param count_cutoff: A cutoff value that is used to discard
            rare joint-features.  If a joint-feature's value is 1
            fewer than ``count_cutoff`` times in the training corpus,
            then that joint-feature is not included in the generated
            encoding.

        :type labels: list
        :param labels: A list of labels that should be used by the
            classifier.  If not specified, then the set of labels
            attested in ``train_toks`` will be used.

        :param options: Extra parameters for the constructor, such as
            ``unseen_features`` and ``alwayson_features``.

        :raise ValueError: If ``labels`` is given and a training token
            carries a label that is not in it.
        """
        mapping = {}  # maps (fname, fval, label) -> fid
        seen_labels = set()  # The set of labels we've encountered
        count = defaultdict(int)  # maps (fname, fval) -> count

        for tok, label in train_toks:
            if labels and label not in labels:
                raise ValueError("Unexpected label %s" % label)
            # Without this the "attested labels" fallback below is empty.
            seen_labels.add(label)

            # Record each of the features.
            for fname, fval in tok.items():
                # Numeric features are keyed by their type, not their
                # value.  The exact-type test deliberately excludes bool.
                if type(fval) in (int, float):
                    fval = type(fval)
                # If a count cutoff is given, then only add a joint
                # feature once the (fname, fval) pair has been seen at
                # least that many times (counted across all labels).
                count[fname, fval] += 1
                if count[fname, fval] >= count_cutoff:
                    if (fname, fval, label) not in mapping:
                        mapping[fname, fval, label] = len(mapping)

        if labels is None:
            labels = seen_labels
        return cls(labels, mapping, **options)

######################################################################
# { Classifier Trainer: Generalized Iterative Scaling
######################################################################

def train_maxent_classifier_with_gis(
    train_toks, trace=3, encoding=None, labels=None, **cutoffs
):
    """
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the Generalized Iterative Scaling
    algorithm.  This ``ConditionalExponentialClassifier`` will encode
    the model that maximizes entropy from all the models that are
    empirically consistent with ``train_toks``.

    :param train_toks: Training data as ``(featureset, label)`` pairs.
    :param trace: Verbosity.  Values above 0 print a banner; values
        above 2 also print a per-iteration table and a final summary.
    :param encoding: The feature encoding to use.  If None, one is
        built with ``GISEncoding.train(train_toks, labels=labels)``.
    :param labels: Only used when ``encoding`` is None, where it is
        forwarded to ``GISEncoding.train``.
    :param cutoffs: Stopping criteria handed to ``CutoffChecker``;
        ``max_iter`` defaults to 100.
    :raise TypeError: If ``encoding`` does not define an attribute ``C``.
    :see: ``train_maxent_classifier()`` for parameter descriptions.
    """
    cutoffs.setdefault("max_iter", 100)
    cutoffchecker = CutoffChecker(cutoffs)

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = GISEncoding.train(train_toks, labels=labels)

    if not hasattr(encoding, "C"):
        raise TypeError(
            "The GIS algorithm requires an encoding that "
            "defines C (e.g., GISEncoding)."
        )

    # Cinv is the inverse of the sum of each joint feature vector.
    # This controls the learning rate: higher Cinv (or lower C) gives
    # faster learning.
    Cinv = 1.0 / encoding.C

    # Count how many times each feature occurs in the training data.
    empirical_fcount = calculate_empirical_fcount(train_toks, encoding)

    # Check for any features that are not attested in train_toks.
    # numpy.nonzero returns one index array per dimension; take the
    # first (and only) one to get the feature ids themselves.
    unattested = set(numpy.nonzero(empirical_fcount == 0)[0])

    # Build the classifier.  Start with weight=0 for each attested
    # feature, and weight=-infinity for each unattested feature.
    weights = numpy.zeros(len(empirical_fcount), "d")
    for fid in unattested:
        weights[fid] = -numpy.inf
    classifier = ConditionalExponentialClassifier(encoding, weights)

    # Take the log of the empirical fcount.
    log_empirical_fcount = numpy.log2(empirical_fcount)
    del empirical_fcount

    if trace > 0:
        print("  ==> Training (%d iterations)" % cutoffs["max_iter"])
    if trace > 2:
        print()
        print("      Iteration    Log Likelihood    Accuracy")
        print("      ---------------------------------------")

    # Train the classifier.  A keyboard interrupt stops training early
    # and returns the classifier as trained so far.
    try:
        while True:
            if trace > 2:
                ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
                acc = cutoffchecker.acc or accuracy(classifier, train_toks)
                iternum = cutoffchecker.iter
                print("     %9d    %14.5f    %9.3f" % (iternum, ll, acc))

            # Use the model to estimate the number of times each
            # feature should occur in the training data.
            estimated_fcount = calculate_estimated_fcount(
                classifier, train_toks, encoding
            )

            # Take the log of estimated fcount (avoid taking log(0).)
            for fid in unattested:
                estimated_fcount[fid] += 1
            log_estimated_fcount = numpy.log2(estimated_fcount)
            del estimated_fcount

            # Update the classifier weights
            weights = classifier.weights()
            weights += (log_empirical_fcount - log_estimated_fcount) * Cinv
            classifier.set_weights(weights)

            # Check the log-likelihood & accuracy cutoffs.
            if cutoffchecker.check(classifier, train_toks):
                break

    except KeyboardInterrupt:
        print("      Training stopped: keyboard interrupt")

    if trace > 2:
        ll = log_likelihood(classifier, train_toks)
        acc = accuracy(classifier, train_toks)
        print(f"         Final    {ll:14.5f}    {acc:9.3f}")

    # Return the classifier.
    return classifier

def calculate_empirical_fcount(train_toks, encoding):
    """
    Sum the encoded feature vectors of every training token.

    :param train_toks: Training data as ``(featureset, label)`` pairs.
    :param encoding: Object providing ``length()`` and
        ``encode(featureset, label)``.
    :return: An array of ``encoding.length()`` doubles in which entry
        *i* is the total value of feature *i* over ``train_toks``.
    """
    totals = numpy.zeros(encoding.length(), "d")

    for featureset, gold_label in train_toks:
        encoded = encoding.encode(featureset, gold_label)
        for fid, fval in encoded:
            totals[fid] += fval

    return totals

def calculate_estimated_fcount(classifier, train_toks, encoding):
    """
    Compute the feature totals the current model expects on ``train_toks``.

    For every training featureset, each label proposed by the classifier
    contributes its encoded feature vector scaled by the probability the
    classifier assigns to that label.  The gold labels in ``train_toks``
    are not used.

    :param classifier: Object providing ``prob_classify(featureset)``.
    :param train_toks: Training data as ``(featureset, label)`` pairs.
    :param encoding: Object providing ``length()`` and
        ``encode(featureset, label)``.
    :return: An array of ``encoding.length()`` doubles.
    """
    expected = numpy.zeros(encoding.length(), "d")

    for featureset, _ in train_toks:
        pdist = classifier.prob_classify(featureset)
        for candidate in pdist.samples():
            weight = pdist.prob(candidate)
            for fid, fval in encoding.encode(featureset, candidate):
                expected[fid] += weight * fval

    return expected

######################################################################
# { Classifier Trainer: Improved Iterative Scaling
######################################################################

def train_maxent_classifier_with_iis(
    train_toks, trace=3, encoding=None, labels=None, **cutoffs
):
    """
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the Improved Iterative Scaling algorithm.
    This ``ConditionalExponentialClassifier`` will encode the model
    that maximizes entropy from all the models that are empirically
    consistent with ``train_toks``.

    :param train_toks: Training data as ``(featureset, label)`` pairs.
    :param trace: Verbosity.  Values above 0 print a banner; values
        above 2 also print a per-iteration table and a final summary.
    :param encoding: The feature encoding to use.  If None, one is
        built with ``BinaryMaxentFeatureEncoding.train(train_toks,
        labels=labels)``.
    :param labels: Only used when ``encoding`` is None, where it is
        forwarded to ``BinaryMaxentFeatureEncoding.train``.
    :param cutoffs: Stopping criteria handed to ``CutoffChecker``;
        ``max_iter`` defaults to 100.
    :see: ``train_maxent_classifier()`` for parameter descriptions.
    """
    cutoffs.setdefault("max_iter", 100)
    cutoffchecker = CutoffChecker(cutoffs)

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels)

    # Count how many times each feature occurs in the training data.
    empirical_ffreq = calculate_empirical_fcount(train_toks, encoding) / len(train_toks)

    # Find the nf map, and related variables nfarray and nftranspose.
    # nf is the sum of the features for a given labeled text.
    # nfmap compresses this sparse set of values to a dense list.
    # nfarray performs the reverse operation.  nftranspose is
    # nfarray reshaped into a column vector.
    nfmap = calculate_nfmap(train_toks, encoding)
    nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), "d")
    nftranspose = numpy.reshape(nfarray, (len(nfarray), 1))

    # Check for any features that are not attested in train_toks.
    # numpy.nonzero returns one index array per dimension; take the
    # first (and only) one to get the feature ids themselves.
    unattested = set(numpy.nonzero(empirical_ffreq == 0)[0])

    # Build the classifier.  Start with weight=0 for each attested
    # feature, and weight=-infinity for each unattested feature.
    weights = numpy.zeros(len(empirical_ffreq), "d")
    for fid in unattested:
        weights[fid] = -numpy.inf
    classifier = ConditionalExponentialClassifier(encoding, weights)

    if trace > 0:
        print("  ==> Training (%d iterations)" % cutoffs["max_iter"])
    if trace > 2:
        print()
        print("      Iteration    Log Likelihood    Accuracy")
        print("      ---------------------------------------")

    # Train the classifier.  A keyboard interrupt stops training early
    # and returns the classifier as trained so far.
    try:
        while True:
            if trace > 2:
                ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
                acc = cutoffchecker.acc or accuracy(classifier, train_toks)
                iternum = cutoffchecker.iter
                print("     %9d    %14.5f    %9.3f" % (iternum, ll, acc))

            # Calculate the deltas for this iteration, using Newton's method.
            deltas = calculate_deltas(
                train_toks,
                classifier,
                unattested,
                empirical_ffreq,
                nfmap,
                nfarray,
                nftranspose,
                encoding,
            )

            # Use the deltas to update our weights.
            weights = classifier.weights()
            weights += deltas
            classifier.set_weights(weights)

            # Check the log-likelihood & accuracy cutoffs.
            if cutoffchecker.check(classifier, train_toks):
                break

    except KeyboardInterrupt:
        print("      Training stopped: keyboard interrupt")

    if trace > 2:
        ll = log_likelihood(classifier, train_toks)
        acc = accuracy(classifier, train_toks)
        print(f"         Final    {ll:14.5f}    {acc:9.3f}")

    # Return the classifier.
    return classifier

def calculate_nfmap(train_toks, encoding):
    """
    Build a map that compresses the (typically sparse) values of ``nf``
    into a dense index range.

    *nf(feature_vector)* is the sum of the feature values in
    *feature_vector*.  Every training featureset is encoded with every
    label of ``encoding``, the distinct sums that occur are collected,
    and each one is assigned an index in *0...N-1*.  For example, if the
    only attested sums were 3, 5 and 7, the result might be
    ``{3: 0, 5: 1, 7: 2}``.

    :param train_toks: Training data as ``(featureset, label)`` pairs;
        only the featuresets are used.
    :param encoding: Object providing ``labels()`` and
        ``encode(featureset, label)``.
    :return: A map that can be used to compress ``nf`` to a dense
        vector.
    :rtype: dict(int -> int)
    """
    # Map from nf to indices.  This allows us to use smaller arrays.
    attested = set()
    for featureset, _ in train_toks:
        for label in encoding.labels():
            vector = encoding.encode(featureset, label)
            attested.add(sum(fval for (_, fval) in vector))
    return dict(zip(attested, range(len(attested))))

def calculate_deltas(
    train_toks,
    classifier,
    unattested,
    ffreq_empirical,
    nfmap,
    nfarray,
    nftranspose,
    encoding,
):
    r"""
    Calculate the update values for the classifier weights for
    this iteration of IIS.  These update weights are the value of
    ``delta`` that solves the equation::

      ffreq_empirical[i]
             =
      SUM[fs,l] (classifier.prob_classify(fs).prob(l) *
                 feature_vector(fs,l)[i] *
                 exp(delta[i] * nf(feature_vector(fs,l))))

    Where:
        - *(fs,l)* ranges over the training featuresets paired with
          every label of ``encoding``
        - *feature_vector(fs,l)* = ``encoding.encode(fs,l)``
        - *nf(vector)* = ``sum([val for (id,val) in vector])``

    The equation is solved iteratively, starting from ``delta[i]`` = 1
    and repeatedly applying:

    | delta[i] -= (ffreq_empirical[i] - sum1[i])/(-sum2[i])

    where *sum1* and *sum2* are recomputed from ``delta`` on every pass:

    |    sum1[i](delta) = SUM[fs,l] f[i](fs,l,delta)
    |    sum2[i](delta) = SUM[fs,l] (f[i](fs,l,delta).nf(feature_vector(fs,l)))
    |    f[i](fs,l,delta) = (classifier.prob_classify(fs).prob(l) .
    |                        feature_vector(fs,l)[i] .
    |                        exp(delta[i] . nf(feature_vector(fs,l))))

    The implementation evaluates the exponential in base 2.  The
    variables ``nfmap``, ``nfarray``, and ``nftranspose`` provide a
    dense encoding of *nf*, so that *sum1* and *sum2* can be computed
    with matrix operations.

    :param train_toks: The set of training tokens.
    :type train_toks: list(tuple(dict, str))
    :param classifier: The current classifier.
    :type classifier: ClassifierI
    :param unattested: The ids of the features that are not attested
        in the training data.  ``sum2`` is incremented at these ids to
        avoid dividing by zero.
    :type unattested: iterable of int
    :param ffreq_empirical: An array containing the empirical
        frequency for each feature.  The *i*\ th element of this
        array is the empirical frequency for feature *i*.
    :type ffreq_empirical: sequence of float
    :param nfmap: A map that can be used to compress ``nf`` to a dense
        vector.
    :type nfmap: dict(int -> int)
    :param nfarray: An array that can be used to uncompress ``nf``
        from a dense vector.
    :type nfarray: array(float)
    :param nftranspose: The transpose of ``nfarray``
    :type nftranspose: array(float)
    :param encoding: Object providing ``length()``, ``labels()`` and
        ``encode(featureset, label)``.
    :return: One update value per feature.
    """
    # These parameters control when we decide that we've
    # converged.  It probably should be possible to set these
    # manually, via keyword arguments to train.
    NEWTON_CONVERGE = 1e-12
    MAX_NEWTON = 300

    num_features = encoding.length()
    deltas = numpy.ones(num_features, "d")

    # Precompute the A matrix:
    # A[nf][id] = sum ( p(fs) * p(label|fs) * f(fs,label) )
    # over all label,fs s.t. num_features[label,fs]=nf
    A = numpy.zeros((len(nfmap), num_features), "d")

    for featureset, _ in train_toks:
        dist = classifier.prob_classify(featureset)

        for candidate in encoding.labels():
            # Generate the feature vector
            feature_vector = encoding.encode(featureset, candidate)
            # Find the number of active features, and its dense row index
            nf = sum(val for (_, val) in feature_vector)
            row = nfmap[nf]
            # Update the A matrix
            for fid, val in feature_vector:
                A[row, fid] += dist.prob(candidate) * val
    A /= len(train_toks)

    # Iteratively solve for delta.  Use the following variables:
    #   - nf_delta[x][y] = nfarray[x] * delta[y]
    #   - exp_nf_delta[x][y] = exp(nf[x] * delta[y])
    #   - nf_exp_nf_delta[x][y] = nf[x] * exp(nf[x] * delta[y])
    #   - sum1[i][nf] = sum p(fs)p(label|fs)f[i](label,fs)
    #                       exp(delta[i]nf)
    #   - sum2[i][nf] = sum p(fs)p(label|fs)f[i](label,fs)
    #                       nf exp(delta[i]nf)
    for _ in range(MAX_NEWTON):
        nf_delta = numpy.outer(nfarray, deltas)
        exp_nf_delta = 2 ** nf_delta
        nf_exp_nf_delta = nftranspose * exp_nf_delta
        sum1 = numpy.sum(exp_nf_delta * A, axis=0)
        sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)

        # Avoid division by zero.
        for fid in unattested:
            sum2[fid] += 1

        # Update the deltas.
        residual = ffreq_empirical - sum1
        deltas -= residual / -sum2

        # We can stop once we converge.
        n_error = numpy.sum(abs(residual)) / numpy.sum(abs(deltas))
        if n_error < NEWTON_CONVERGE:
            break

    return deltas

######################################################################
# { Classifier Trainer: megam
######################################################################

# [xx] possible extension: add support for using implicit file format;
# this would need to put requirements on what encoding is used.  But
# we may need this for other maxent classifier trainers that require
# implicit formats anyway.
def train_maxent_classifier_with_megam(
    train_toks, trace=3, encoding=None, labels=None, gaussian_prior_sigma=0, **kwargs
):
    """
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the external ``megam`` library.  This
    ``ConditionalExponentialClassifier`` will encode the model that
    maximizes entropy from all the models that are empirically
    consistent with ``train_toks``.

    :param train_toks: Training data as ``(featureset, label)`` pairs.
    :param trace: Verbosity; values below 3 pass ``-quiet`` to megam.
    :param encoding: The feature encoding to use.  If None, a
        ``BinaryMaxentFeatureEncoding`` with always-on features is
        trained from ``train_toks``.
    :param labels: Labels for the automatically built encoding.  May
        not be combined with an explicit ``encoding``.
    :param gaussian_prior_sigma: Standard deviation of the Gaussian
        prior; 0 disables the prior.
    :param kwargs: Recognized keys are ``explicit``, ``bernoulli``,
        ``count_cutoff``, ``max_iter`` and ``ll_delta``.
    :raise ValueError: If both ``encoding`` and ``labels`` are given,
        or if the megam training file cannot be created.
    :see: ``train_maxent_classifier()`` for parameter descriptions.
    :see: ``nltk.classify.megam``
    """
    explicit = kwargs.get("explicit", True)
    bernoulli = kwargs.get("bernoulli", True)

    # Construct an encoding from the training data.
    if encoding is None:
        # Count cutoff can also be controlled by megam with the -minfc
        # option. Not sure where the best place for it is.
        count_cutoff = kwargs.get("count_cutoff", 0)
        encoding = BinaryMaxentFeatureEncoding.train(
            train_toks, count_cutoff, labels=labels, alwayson_features=True
        )
    elif labels is not None:
        raise ValueError("Specify encoding or labels, not both")

    # Create the temporary training file for megam.
    try:
        fd, trainfile_name = tempfile.mkstemp(prefix="nltk-")
    except OSError as e:
        raise ValueError("Error while creating megam training file: %s" % e) from e

    # From here on the temporary file exists; the ``finally`` clause makes
    # sure it is removed even when writing it or running megam fails.
    try:
        try:
            # Wrapping the descriptor returned by mkstemp means it is
            # closed together with the file object, even on error.
            with os.fdopen(fd, "w") as trainfile:
                write_megam_file(
                    train_toks,
                    encoding,
                    trainfile,
                    explicit=explicit,
                    bernoulli=bernoulli,
                )
        except (OSError, ValueError) as e:
            raise ValueError(
                "Error while creating megam training file: %s" % e
            ) from e

        # Run megam on the training file.
        options = []
        options += ["-nobias", "-repeat", "10"]
        if explicit:
            options += ["-explicit"]
        if not bernoulli:
            options += ["-fvals"]
        if gaussian_prior_sigma:
            # Lambda is just the precision of the Gaussian prior, i.e. it's the
            # inverse variance, so the parameter conversion is 1.0/sigma**2.
            # See https://users.umiacs.umd.edu/~hal/docs/daume04cg-bfgs.pdf
            inv_variance = 1.0 / gaussian_prior_sigma ** 2
        else:
            inv_variance = 0
        options += ["-lambda", "%.2f" % inv_variance, "-tune"]
        if trace < 3:
            options += ["-quiet"]
        if "max_iter" in kwargs:
            options += ["-maxi", "%s" % kwargs["max_iter"]]
        if "ll_delta" in kwargs:
            # [xx] this is actually a perplexity delta, not a log
            # likelihood delta
            options += ["-dpp", "%s" % abs(kwargs["ll_delta"])]
        if hasattr(encoding, "cost"):
            # Passed only for encodings that define a ``cost`` attribute.
            options += ["-multilabel"]
        options += ["multiclass", trainfile_name]
        stdout = call_megam(options)
    finally:
        # Delete the training file
        try:
            os.remove(trainfile_name)
        except OSError as e:
            print(f"Warning: unable to delete {trainfile_name}: {e}")

    # Parse the generated weight vector.
    weights = parse_megam_weights(stdout, encoding.length(), explicit)

    # Convert from base-e to base-2 weights.
    weights *= numpy.log2(numpy.e)

    # Build the classifier
    return MaxentClassifier(encoding, weights)

######################################################################
# { Classifier Trainer: tadm
######################################################################

class TadmMaxentClassifier(MaxentClassifier):
    """A ``MaxentClassifier`` whose weights are estimated by the external TADM tool."""

    # NOTE(review): the class header and every statement that mentions tadm
    # were missing from this block; they are reconstructed here from the
    # surviving statements and the helpers in ``nltk.classify.tadm``.
    # Confirm against the upstream module.

    @classmethod
    def train(cls, train_toks, **kwargs):
        """
        Train a classifier by writing ``train_toks`` to an events file,
        running TADM on it, and reading back the resulting weights.

        :param train_toks: Training data as ``(featureset, label)`` pairs.
        :param kwargs: Recognized keys are ``algorithm`` (default
            ``"tao_lmvm"``), ``trace`` (default 3), ``encoding``,
            ``labels``, ``gaussian_prior_sigma`` (default 0),
            ``count_cutoff`` (default 0), ``max_iter`` and
            ``min_lldelta``.
        :return: A new instance of ``cls`` built from the encoding and
            the learned weights.
        """
        # Imported locally so this block does not depend on a
        # module-level import being present.
        from nltk.classify.tadm import call_tadm, parse_tadm_weights, write_tadm_file

        algorithm = kwargs.get("algorithm", "tao_lmvm")
        trace = kwargs.get("trace", 3)
        encoding = kwargs.get("encoding", None)
        labels = kwargs.get("labels", None)
        sigma = kwargs.get("gaussian_prior_sigma", 0)
        count_cutoff = kwargs.get("count_cutoff", 0)
        max_iter = kwargs.get("max_iter")
        ll_delta = kwargs.get("min_lldelta")

        # Construct an encoding from the training data.
        if not encoding:
            encoding = TadmEventMaxentFeatureEncoding.train(
                train_toks, count_cutoff, labels=labels
            )

        # mkstemp returns an open descriptor; the files are reopened by
        # name below, so close the descriptors right away to avoid leaks.
        trainfile_fd, trainfile_name = tempfile.mkstemp(
            prefix="nltk-tadm-events-", suffix=".gz"
        )
        os.close(trainfile_fd)
        weightfile_name = None

        try:
            weightfile_fd, weightfile_name = tempfile.mkstemp(
                prefix="nltk-tadm-weights-"
            )
            os.close(weightfile_fd)

            trainfile = gzip_open_unicode(trainfile_name, "w")
            try:
                write_tadm_file(train_toks, encoding, trainfile)
            finally:
                trainfile.close()

            options = []
            options.extend(["-monitor"])
            options.extend(["-method", algorithm])
            if sigma:
                options.extend(["-l2", "%.6f" % sigma ** 2])
            if max_iter:
                options.extend(["-max_it", "%d" % max_iter])
            if ll_delta:
                options.extend(["-fatol", "%.6f" % abs(ll_delta)])
            options.extend(["-events_in", trainfile_name])
            options.extend(["-params_out", weightfile_name])
            if trace < 3:
                options.extend(["2>&1"])
            else:
                options.extend(["-summary"])

            call_tadm(options)

            with open(weightfile_name) as weightfile:
                weights = parse_tadm_weights(weightfile)
        finally:
            # Remove the temporary files whether or not training succeeded.
            os.remove(trainfile_name)
            if weightfile_name is not None:
                os.remove(weightfile_name)

        # Convert from base-e to base-2 weights.
        weights *= numpy.log2(numpy.e)

        # Build the classifier
        return cls(encoding, weights)

######################################################################
# { Demo
######################################################################
def demo():
    """Run ``names_demo`` from ``nltk.classify.util`` with ``MaxentClassifier.train`` as the trainer."""
    from nltk.classify.util import names_demo

    names_demo(MaxentClassifier.train)

# Run the demo when this module is executed as a script.
if __name__ == "__main__":
    demo()
```