
# Natural Language Toolkit: Interface to Megam Classifier
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
A set of functions used to interface with the external megam_ maxent
optimization package. Before megam can be used, you should tell NLTK where it
can find the megam binary, using the ``config_megam()`` function. Typical
usage:

    >>> from nltk.classify import megam
    >>> megam.config_megam() # pass path to megam if not found in PATH # doctest: +SKIP
    [Found megam: ...]

Use with MaxentClassifier as shown below; see the MaxentClassifier
documentation for details.

    nltk.classify.MaxentClassifier.train(corpus, 'megam')

.. _megam: https://www.umiacs.umd.edu/~hal/megam/index.html
"""
import subprocess

from nltk.internals import find_binary

try:
    import numpy
except ImportError:
    numpy = None

######################################################################
# { Configuration
######################################################################

_megam_bin = None


def config_megam(bin=None):
    """
    Configure NLTK's interface to the ``megam`` maxent optimization
    package.

    :param bin: The full path to the ``megam`` binary.  If not specified,
        then nltk will search the system for a ``megam`` binary; and if
        one is not found, it will raise a ``LookupError`` exception.
    :type bin: str
    """
    global _megam_bin
    _megam_bin = find_binary(
        "megam",
        bin,
        env_vars=["MEGAM"],
        binary_names=["megam.opt", "megam", "megam_686", "megam_i686.opt"],
        url="https://www.umiacs.umd.edu/~hal/megam/index.html",
    )
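
# A minimal usage sketch (not part of the original module): configuring the
# interface before training.  "/path/to/megam" is a hypothetical placeholder;
# with no argument, find_binary() searches the system PATH and the MEGAM
# environment variable and raises a LookupError if no binary is found.
def _example_config_megam():
    try:
        config_megam()  # or config_megam("/path/to/megam") with a real path
    except LookupError:
        print("megam binary not found; install megam or set $MEGAM")
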
######################################################################
# { Megam Interface Functions
######################################################################

def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True):
    """
    Generate an input file for ``megam`` based on the given corpus of
    classified tokens.

    :type train_toks: list(tuple(dict, str))
    :param train_toks: Training data, represented as a list of
        pairs, the first member of which is a feature dictionary,
        and the second of which is a classification label.

    :type encoding: MaxentFeatureEncodingI
    :param encoding: A feature encoding, used to convert featuresets
        into feature vectors.  May optionally implement a cost() method
        in order to assign different costs to different class predictions.

    :type stream: stream
    :param stream: The stream to which the megam input file should be
        written.

    :param bernoulli: If true, then use the 'bernoulli' format.  I.e.,
        all joint features have binary values, and are listed iff they
        are true.  Otherwise, list feature values explicitly.  If
        ``bernoulli=False``, then you must call ``megam`` with the
        ``-fvals`` option.

    :param explicit: If true, then use the 'explicit' format.  I.e.,
        list the features that would fire for any of the possible
        labels, for each token.  If ``explicit=True``, then you must
        call ``megam`` with the ``-explicit`` option.
    """
    # Look up the set of labels.
    labels = encoding.labels()
    labelnum = {label: i for (i, label) in enumerate(labels)}

    # Write the file, which contains one line per instance.
    for featureset, label in train_toks:
        # First, the instance number (or, in the weighted multiclass
        # case, the cost of each label).
        if hasattr(encoding, "cost"):
            stream.write(
                ":".join(str(encoding.cost(featureset, label, l)) for l in labels)
            )
        else:
            stream.write("%d" % labelnum[label])

        # For implicit file formats, just list the features that fire
        # for this instance's actual label.
        if not explicit:
            _write_megam_features(encoding.encode(featureset, label), stream, bernoulli)

        # For explicit formats, list the features that would fire for
        # any of the possible labels.
        else:
            for l in labels:
                stream.write(" #")
                _write_megam_features(
                    encoding.encode(featureset, l), stream, bernoulli
                )

        # End of the instance.
        stream.write("\n")
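
# Illustrative sketch (not part of the original module): write a tiny megam
# input file to an in-memory stream in order to inspect the format.  The toy
# corpus is invented for the example; BinaryMaxentFeatureEncoding.train comes
# from nltk.classify.maxent.
def _example_write_megam_file():
    import io

    from nltk.classify.maxent import BinaryMaxentFeatureEncoding

    train_toks = [
        ({"a": True, "b": True}, "x"),
        ({"a": True, "c": True}, "y"),
    ]
    encoding = BinaryMaxentFeatureEncoding.train(train_toks)
    stream = io.StringIO()
    # explicit=True means megam must later be run with '-explicit';
    # bernoulli=True means every listed feature is assumed to have value 1.
    write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True)
    return stream.getvalue()
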
def parse_megam_weights(s, features_count, explicit=True):
    """
    Given the stdout output generated by ``megam`` when training a
    model, return a ``numpy`` array containing the corresponding weight
    vector.  This function does not currently handle bias features.
    """
    if numpy is None:
        raise ValueError("This function requires that numpy be installed")
    assert explicit, "non-explicit not supported yet"
    lines = s.strip().split("\n")
    weights = numpy.zeros(features_count, "d")
    for line in lines:
        if line.strip():
            fid, weight = line.split()
            weights[int(fid)] = float(weight)
    return weights
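
# Illustrative sketch (not part of the original module): parse a hand-written
# stand-in for megam's weight output (one "feature_id weight" pair per line,
# which is the format this parser expects).  Requires numpy.
def _example_parse_megam_weights():
    fake_stdout = "0 0.25\n1 -1.5\n2 0.0\n"
    # features_count must be at least (highest feature id + 1).
    return parse_megam_weights(fake_stdout, features_count=3)
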
def _write_megam_features(vector, stream, bernoulli):
    if not vector:
        raise ValueError(
            "MEGAM classifier requires the use of an always-on feature."
        )
    for (fid, fval) in vector:
        if bernoulli:
            if fval == 1:
                stream.write(" %s" % fid)
            elif fval != 0:
                raise ValueError(
                    "If bernoulli=True, then all features must be binary."
                )
        else:
            stream.write(f" {fid} {fval}")
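
# Illustrative sketch (not part of the original module): the two per-feature
# formats written by _write_megam_features.  With bernoulli=True only the ids
# of active (value 1) features are listed; with bernoulli=False each id is
# followed by its value, which is the format megam's '-fvals' option expects.
def _example_feature_formats():
    import io

    out = io.StringIO()
    _write_megam_features([(0, 1), (3, 1)], out, bernoulli=True)  # " 0 3"
    _write_megam_features([(0, 0.5), (3, 2)], out, bernoulli=False)  # " 0 0.5 3 2"
    return out.getvalue()
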
def call_megam(args):
    """
    Call the ``megam`` binary with the given arguments.
    """
    if isinstance(args, str):
        raise TypeError("args should be a list of strings")
    if _megam_bin is None:
        config_megam()

    # Call megam via a subprocess
    cmd = [_megam_bin] + args
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    (stdout, stderr) = p.communicate()

    # Check the return code.
    if p.returncode != 0:
        print()
        print(stderr)
        raise OSError("megam command failed!")

    if isinstance(stdout, str):
        return stdout
    else:
        return stdout.decode("utf-8")
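
# Illustrative end-to-end sketch (not part of the original module): write a
# temporary megam input file, run the binary, and parse the resulting weight
# vector.  It assumes a working megam binary that config_megam() can locate;
# the '-explicit' and 'multiclass' arguments are an assumption about megam's
# command line matching write_megam_file()'s defaults, and '-nobias' is passed
# on the assumption that bias features should be suppressed, since
# parse_megam_weights() does not handle them.
def _example_train_weights(train_toks):
    import os
    import tempfile

    from nltk.classify.maxent import BinaryMaxentFeatureEncoding

    encoding = BinaryMaxentFeatureEncoding.train(train_toks)
    fd, path = tempfile.mkstemp(prefix="nltk-megam-")
    try:
        with os.fdopen(fd, "w") as stream:
            write_megam_file(train_toks, encoding, stream)
        stdout = call_megam(["-nobias", "-explicit", "multiclass", path])
        return parse_megam_weights(stdout, encoding.length())
    finally:
        os.remove(path)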