# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2024 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from collections import Counter, defaultdict
from nltk import jsontags
from nltk.tag import TaggerI
from nltk.tbl import Feature, Template
######################################################################
# Brill Templates
######################################################################
[docs]
@jsontags.register_tag
class Word(Feature):
"""
Feature which examines the text (word) of nearby tokens.
"""
json_tag = "nltk.tag.brill.Word"
[docs]
@jsontags.register_tag
class Pos(Feature):
"""
Feature which examines the tags of nearby tokens.
"""
json_tag = "nltk.tag.brill.Pos"
[docs]
def nltkdemo18():
"""
Return 18 templates, from the original nltk demo, in multi-feature syntax
"""
return [
Template(Pos([-1])),
Template(Pos([1])),
Template(Pos([-2])),
Template(Pos([2])),
Template(Pos([-2, -1])),
Template(Pos([1, 2])),
Template(Pos([-3, -2, -1])),
Template(Pos([1, 2, 3])),
Template(Pos([-1]), Pos([1])),
Template(Word([-1])),
Template(Word([1])),
Template(Word([-2])),
Template(Word([2])),
Template(Word([-2, -1])),
Template(Word([1, 2])),
Template(Word([-3, -2, -1])),
Template(Word([1, 2, 3])),
Template(Word([-1]), Word([1])),
]
[docs]
def nltkdemo18plus():
"""
Return 18 templates, from the original nltk demo, and additionally a few
multi-feature ones (the motivation is easy comparison with nltkdemo18)
"""
return nltkdemo18() + [
Template(Word([-1]), Pos([1])),
Template(Pos([-1]), Word([1])),
Template(Word([-1]), Word([0]), Pos([1])),
Template(Pos([-1]), Word([0]), Word([1])),
Template(Pos([-1]), Word([0]), Pos([1])),
]
[docs]
def fntbl37():
"""
Return 37 templates taken from the postagging task of the
fntbl distribution https://www.cs.jhu.edu/~rflorian/fntbl/
(37 is after excluding a handful which do not condition on Pos[0];
fntbl can do that but the current nltk implementation cannot.)
"""
return [
Template(Word([0]), Word([1]), Word([2])),
Template(Word([-1]), Word([0]), Word([1])),
Template(Word([0]), Word([-1])),
Template(Word([0]), Word([1])),
Template(Word([0]), Word([2])),
Template(Word([0]), Word([-2])),
Template(Word([1, 2])),
Template(Word([-2, -1])),
Template(Word([1, 2, 3])),
Template(Word([-3, -2, -1])),
Template(Word([0]), Pos([2])),
Template(Word([0]), Pos([-2])),
Template(Word([0]), Pos([1])),
Template(Word([0]), Pos([-1])),
Template(Word([0])),
Template(Word([-2])),
Template(Word([2])),
Template(Word([1])),
Template(Word([-1])),
Template(Pos([-1]), Pos([1])),
Template(Pos([1]), Pos([2])),
Template(Pos([-1]), Pos([-2])),
Template(Pos([1])),
Template(Pos([-1])),
Template(Pos([-2])),
Template(Pos([2])),
Template(Pos([1, 2, 3])),
Template(Pos([1, 2])),
Template(Pos([-3, -2, -1])),
Template(Pos([-2, -1])),
Template(Pos([1]), Word([0]), Word([1])),
Template(Pos([1]), Word([0]), Word([-1])),
Template(Pos([-1]), Word([-1]), Word([0])),
Template(Pos([-1]), Word([0]), Word([1])),
Template(Pos([-2]), Pos([-1])),
Template(Pos([1]), Pos([2])),
Template(Pos([1]), Pos([2]), Word([1])),
]
[docs]
def brill24():
"""
Return 24 templates of the seminal TBL paper, Brill (1995)
"""
return [
Template(Pos([-1])),
Template(Pos([1])),
Template(Pos([-2])),
Template(Pos([2])),
Template(Pos([-2, -1])),
Template(Pos([1, 2])),
Template(Pos([-3, -2, -1])),
Template(Pos([1, 2, 3])),
Template(Pos([-1]), Pos([1])),
Template(Pos([-2]), Pos([-1])),
Template(Pos([1]), Pos([2])),
Template(Word([-1])),
Template(Word([1])),
Template(Word([-2])),
Template(Word([2])),
Template(Word([-2, -1])),
Template(Word([1, 2])),
Template(Word([-1, 0])),
Template(Word([0, 1])),
Template(Word([0])),
Template(Word([-1]), Pos([-1])),
Template(Word([1]), Pos([1])),
Template(Word([0]), Word([-1]), Pos([-1])),
Template(Word([0]), Word([1]), Pos([1])),
]
[docs]
def describe_template_sets():
"""
Print the available template sets in this demo, with a short description"
"""
import inspect
import sys
# a bit of magic to get all functions in this module
templatesets = inspect.getmembers(sys.modules[__name__], inspect.isfunction)
for name, obj in templatesets:
if name == "describe_template_sets":
continue
print(name, obj.__doc__, "\n")
######################################################################
# The Brill Tagger
######################################################################
[docs]
@jsontags.register_tag
class BrillTagger(TaggerI):
"""
Brill's transformational rule-based tagger. Brill taggers use an
initial tagger (such as ``tag.DefaultTagger``) to assign an initial
tag sequence to a text; and then apply an ordered list of
transformational rules to correct the tags of individual tokens.
These transformation rules are specified by the ``TagRule``
interface.
Brill taggers can be created directly, from an initial tagger and
a list of transformational rules; but more often, Brill taggers
are created by learning rules from a training corpus, using one
of the TaggerTrainers available.
"""
json_tag = "nltk.tag.BrillTagger"
[docs]
def __init__(self, initial_tagger, rules, training_stats=None):
"""
:param initial_tagger: The initial tagger
:type initial_tagger: TaggerI
:param rules: An ordered list of transformation rules that
should be used to correct the initial tagging.
:type rules: list(TagRule)
:param training_stats: A dictionary of statistics collected
during training, for possible later use
:type training_stats: dict
"""
self._initial_tagger = initial_tagger
self._rules = tuple(rules)
self._training_stats = training_stats
[docs]
def encode_json_obj(self):
return self._initial_tagger, self._rules, self._training_stats
[docs]
@classmethod
def decode_json_obj(cls, obj):
_initial_tagger, _rules, _training_stats = obj
return cls(_initial_tagger, _rules, _training_stats)
[docs]
def rules(self):
"""
Return the ordered list of transformation rules that this tagger has learnt
:return: the ordered list of transformation rules that correct the initial tagging
:rtype: list of Rules
"""
return self._rules
[docs]
def train_stats(self, statistic=None):
"""
Return a named statistic collected during training, or a dictionary of all
available statistics if no name given
:param statistic: name of statistic
:type statistic: str
:return: some statistic collected during training of this tagger
:rtype: any (but usually a number)
"""
if statistic is None:
return self._training_stats
else:
return self._training_stats.get(statistic)
[docs]
def tag(self, tokens):
# Inherit documentation from TaggerI
# Run the initial tagger.
tagged_tokens = self._initial_tagger.tag(tokens)
# Create a dictionary that maps each tag to a list of the
# indices of tokens that have that tag.
tag_to_positions = defaultdict(set)
for i, (token, tag) in enumerate(tagged_tokens):
tag_to_positions[tag].add(i)
# Apply each rule, in order. Only try to apply rules at
# positions that have the desired original tag.
for rule in self._rules:
# Find the positions where it might apply
positions = tag_to_positions.get(rule.original_tag, [])
# Apply the rule at those positions.
changed = rule.apply(tagged_tokens, positions)
# Update tag_to_positions with the positions of tags that
# were modified.
for i in changed:
tag_to_positions[rule.original_tag].remove(i)
tag_to_positions[rule.replacement_tag].add(i)
return tagged_tokens
[docs]
def print_template_statistics(self, test_stats=None, printunused=True):
"""
Print a list of all templates, ranked according to efficiency.
If test_stats is available, the templates are ranked according to their
relative contribution (summed for all rules created from a given template,
weighted by score) to the performance on the test set. If no test_stats, then
statistics collected during training are used instead. There is also
an unweighted measure (just counting the rules). This is less informative,
though, as many low-score rules will appear towards end of training.
:param test_stats: dictionary of statistics collected during testing
:type test_stats: dict of str -> any (but usually numbers)
:param printunused: if True, print a list of all unused templates
:type printunused: bool
:return: None
:rtype: None
"""
tids = [r.templateid for r in self._rules]
train_stats = self.train_stats()
trainscores = train_stats["rulescores"]
assert len(trainscores) == len(
tids
), "corrupt statistics: " "{} train scores for {} rules".format(
trainscores, tids
)
template_counts = Counter(tids)
weighted_traincounts = Counter()
for tid, score in zip(tids, trainscores):
weighted_traincounts[tid] += score
tottrainscores = sum(trainscores)
# det_tplsort() is for deterministic sorting;
# the otherwise convenient Counter.most_common() unfortunately
# does not break ties deterministically
# between python versions and will break cross-version tests
def det_tplsort(tpl_value):
return (tpl_value[1], repr(tpl_value[0]))
def print_train_stats():
print(
"TEMPLATE STATISTICS (TRAIN) {} templates, {} rules)".format(
len(template_counts), len(tids)
)
)
print(
"TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
"final: {finalerrors:5d} {finalacc:.4f}".format(**train_stats)
)
head = "#ID | Score (train) | #Rules | Template"
print(head, "\n", "-" * len(head), sep="")
train_tplscores = sorted(
weighted_traincounts.items(), key=det_tplsort, reverse=True
)
for tid, trainscore in train_tplscores:
s = "{} | {:5d} {:5.3f} |{:4d} {:.3f} | {}".format(
tid,
trainscore,
trainscore / tottrainscores,
template_counts[tid],
template_counts[tid] / len(tids),
Template.ALLTEMPLATES[int(tid)],
)
print(s)
def print_testtrain_stats():
testscores = test_stats["rulescores"]
print(
"TEMPLATE STATISTICS (TEST AND TRAIN) ({} templates, {} rules)".format(
len(template_counts), len(tids)
)
)
print(
"TEST ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
"final: {finalerrors:5d} {finalacc:.4f} ".format(**test_stats)
)
print(
"TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
"final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats)
)
weighted_testcounts = Counter()
for tid, score in zip(tids, testscores):
weighted_testcounts[tid] += score
tottestscores = sum(testscores)
head = "#ID | Score (test) | Score (train) | #Rules | Template"
print(head, "\n", "-" * len(head), sep="")
test_tplscores = sorted(
weighted_testcounts.items(), key=det_tplsort, reverse=True
)
for tid, testscore in test_tplscores:
s = "{:s} |{:5d} {:6.3f} | {:4d} {:.3f} |{:4d} {:.3f} | {:s}".format(
tid,
testscore,
testscore / tottestscores,
weighted_traincounts[tid],
weighted_traincounts[tid] / tottrainscores,
template_counts[tid],
template_counts[tid] / len(tids),
Template.ALLTEMPLATES[int(tid)],
)
print(s)
def print_unused_templates():
usedtpls = {int(tid) for tid in tids}
unused = [
(tid, tpl)
for (tid, tpl) in enumerate(Template.ALLTEMPLATES)
if tid not in usedtpls
]
print(f"UNUSED TEMPLATES ({len(unused)})")
for tid, tpl in unused:
print(f"{tid:03d} {str(tpl):s}")
if test_stats is None:
print_train_stats()
else:
print_testtrain_stats()
print()
if printunused:
print_unused_templates()
print()
[docs]
def batch_tag_incremental(self, sequences, gold):
"""
Tags by applying each rule to the entire corpus (rather than all rules to a
single sequence). The point is to collect statistics on the test set for
individual rules.
NOTE: This is inefficient (does not build any index, so will traverse the entire
corpus N times for N rules) -- usually you would not care about statistics for
individual rules and thus use batch_tag() instead
:param sequences: lists of token sequences (sentences, in some applications) to be tagged
:type sequences: list of list of strings
:param gold: the gold standard
:type gold: list of list of strings
:returns: tuple of (tagged_sequences, ordered list of rule scores (one for each rule))
"""
def counterrors(xs):
return sum(t[1] != g[1] for pair in zip(xs, gold) for (t, g) in zip(*pair))
testing_stats = {}
testing_stats["tokencount"] = sum(len(t) for t in sequences)
testing_stats["sequencecount"] = len(sequences)
tagged_tokenses = [self._initial_tagger.tag(tokens) for tokens in sequences]
testing_stats["initialerrors"] = counterrors(tagged_tokenses)
testing_stats["initialacc"] = (
1 - testing_stats["initialerrors"] / testing_stats["tokencount"]
)
# Apply each rule to the entire corpus, in order
errors = [testing_stats["initialerrors"]]
for rule in self._rules:
for tagged_tokens in tagged_tokenses:
rule.apply(tagged_tokens)
errors.append(counterrors(tagged_tokenses))
testing_stats["rulescores"] = [
err0 - err1 for (err0, err1) in zip(errors, errors[1:])
]
testing_stats["finalerrors"] = errors[-1]
testing_stats["finalacc"] = (
1 - testing_stats["finalerrors"] / testing_stats["tokencount"]
)
return (tagged_tokenses, testing_stats)