# Source code for nltk.chunk.util

# Natural Language Toolkit: Chunk format conversions
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import re

from nltk.metrics import accuracy as _accuracy
from nltk.tag.mapping import map_tag
from nltk.tag.util import str2tuple
from nltk.tree import Tree

##//////////////////////////////////////////////////////
## EVALUATION
##//////////////////////////////////////////////////////


def accuracy(chunker, gold):
    """
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it
    using the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """
    reference_tags = []
    predicted_tags = []
    for reference_tree in gold:
        # Flatten strips the chunk structure; the chunker re-chunks it.
        predicted_tree = chunker.parse(reference_tree.flatten())
        reference_tags.extend(tree2conlltags(reference_tree))
        predicted_tags.extend(tree2conlltags(predicted_tree))
    return _accuracy(reference_tags, predicted_tags)
# Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
#   -- statistics are evaluated only on demand, instead of at every sentence evaluation
#
# SB: use nltk.metrics for precision/recall scoring?
#
class ChunkScore:
    """
    A utility class for scoring chunk parsers.  ``ChunkScore`` can
    evaluate a chunk parser's output, based on a number of statistics
    (precision, recall, f-measure, missed chunks, incorrect chunks).
    It can also combine the scores from the parsing of multiple texts;
    this makes it significantly easier to evaluate a chunk parser that
    operates one sentence at a time.

    Texts are evaluated with the ``score`` method.  The results of
    evaluation can be accessed via a number of accessor methods, such
    as ``precision`` and ``f_measure``.  A typical use of the
    ``ChunkScore`` class is::

        >>> chunkscore = ChunkScore()           # doctest: +SKIP
        >>> for correct in correct_sentences:   # doctest: +SKIP
        ...     guess = chunkparser.parse(correct.leaves())   # doctest: +SKIP
        ...     chunkscore.score(correct, guess)              # doctest: +SKIP
        >>> print('F Measure:', chunkscore.f_measure())       # doctest: +SKIP
        F Measure: 0.823

    :ivar kwargs: Keyword arguments:

        - max_tp_examples: The maximum number actual examples of true
          positives to record.  This affects the ``correct`` member
          function: ``correct`` will not return more than this number
          of true positive examples.  This does *not* affect any of
          the numerical metrics (precision, recall, or f-measure).

        - max_fp_examples: The maximum number actual examples of false
          positives to record.  This affects the ``incorrect`` member
          function and the ``guessed`` member function: ``incorrect``
          will not return more than this number of examples, and
          ``guessed`` will not return more than this number of true
          positive examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure).

        - max_fn_examples: The maximum number actual examples of false
          negatives to record.  This affects the ``missed`` member
          function and the ``correct`` member function: ``missed``
          will not return more than this number of examples, and
          ``correct`` will not return more than this number of true
          negative examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure).

        - chunk_label: A regular expression indicating which chunks
          should be compared.  Defaults to ``'.*'`` (i.e., all chunks).

    :type _tp: list(Token)
    :ivar _tp: List of true positives
    :type _fp: list(Token)
    :ivar _fp: List of false positives
    :type _fn: list(Token)
    :ivar _fn: List of false negatives
    :type _tp_num: int
    :ivar _tp_num: Number of true positives
    :type _fp_num: int
    :ivar _fp_num: Number of false positives
    :type _fn_num: int
    :ivar _fn_num: Number of false negatives.
    """

    def __init__(self, **kwargs):
        self._correct = set()  # gold chunks seen so far, with position ids
        self._guessed = set()  # predicted chunks seen so far, with position ids
        self._tp = set()
        self._fp = set()
        self._fn = set()
        # NOTE(review): the three max_*_examples limits below are stored
        # but never enforced by any method of this class.
        self._max_tp = kwargs.get("max_tp_examples", 100)
        self._max_fp = kwargs.get("max_fp_examples", 100)
        self._max_fn = kwargs.get("max_fn_examples", 100)
        self._chunk_label = kwargs.get("chunk_label", ".*")
        self._tp_num = 0
        self._fp_num = 0
        self._fn_num = 0
        self._count = 0  # number of sentences scored so far
        self._tags_correct = 0.0  # tag-level (IOB) accuracy numerator
        self._tags_total = 0.0  # tag-level (IOB) accuracy denominator
        self._measuresNeedUpdate = False

    def _updateMeasures(self):
        # Statistics are recomputed lazily, only when an accessor needs
        # them (performance patch; see the module-level comment).
        if self._measuresNeedUpdate:
            self._tp = self._guessed & self._correct
            self._fn = self._correct - self._guessed
            self._fp = self._guessed - self._correct
            self._tp_num = len(self._tp)
            self._fp_num = len(self._fp)
            self._fn_num = len(self._fn)
            self._measuresNeedUpdate = False

    def score(self, correct, guessed):
        """
        Given a correctly chunked sentence, score another chunked
        version of the same sentence.

        :type correct: chunk structure
        :param correct: The known-correct ("gold standard") chunked
            sentence.
        :type guessed: chunk structure
        :param guessed: The chunked sentence to be scored.
        """
        self._correct |= _chunksets(correct, self._count, self._chunk_label)
        self._guessed |= _chunksets(guessed, self._count, self._chunk_label)
        self._count += 1
        self._measuresNeedUpdate = True
        # Keep track of per-tag accuracy (if possible)
        try:
            correct_tags = tree2conlltags(correct)
            guessed_tags = tree2conlltags(guessed)
        except ValueError:
            # This exception case is for nested chunk structures,
            # where tree2conlltags will fail with a ValueError: "Tree
            # is too deeply nested to be printed in CoNLL format."
            correct_tags = guessed_tags = ()
        self._tags_total += len(correct_tags)
        self._tags_correct += sum(
            1 for (t, g) in zip(guessed_tags, correct_tags) if t == g
        )

    def accuracy(self):
        """
        Return the overall tag-based accuracy for all text that have
        been scored by this ``ChunkScore``, using the IOB (conll2000)
        tag encoding.

        :rtype: float
        """
        if self._tags_total == 0:
            return 1
        return self._tags_correct / self._tags_total

    def precision(self):
        """
        Return the overall precision for all texts that have been
        scored by this ``ChunkScore``.

        :rtype: float
        """
        self._updateMeasures()
        div = self._tp_num + self._fp_num
        if div == 0:
            return 0
        else:
            return self._tp_num / div

    def recall(self):
        """
        Return the overall recall for all texts that have been
        scored by this ``ChunkScore``.

        :rtype: float
        """
        self._updateMeasures()
        div = self._tp_num + self._fn_num
        if div == 0:
            return 0
        else:
            return self._tp_num / div

    def f_measure(self, alpha=0.5):
        """
        Return the overall F measure for all texts that have been
        scored by this ``ChunkScore``.

        :param alpha: the relative weighting of precision and recall.
            Larger alpha biases the score towards the precision value,
            while smaller alpha biases the score towards the recall
            value.  ``alpha`` should have a value in the range [0,1].
        :type alpha: float
        :rtype: float
        """
        self._updateMeasures()
        p = self.precision()
        r = self.recall()
        if p == 0 or r == 0:  # what if alpha is 0 or 1?
            return 0
        return 1 / (alpha / p + (1 - alpha) / r)

    def missed(self):
        """
        Return the chunks which were included in the
        correct chunk structures, but not in the guessed chunk
        structures.  (Order follows set iteration, not input order.)

        :rtype: list of chunks
        """
        self._updateMeasures()
        chunks = list(self._fn)
        return [c[1] for c in chunks]  # discard position information

    def incorrect(self):
        """
        Return the chunks which were included in the guessed chunk
        structures, but not in the correct chunk structures.  (Order
        follows set iteration, not input order.)

        :rtype: list of chunks
        """
        self._updateMeasures()
        chunks = list(self._fp)
        return [c[1] for c in chunks]  # discard position information

    def correct(self):
        """
        Return the chunks which were included in the correct
        chunk structures.  (Order follows set iteration, not input
        order.)

        :rtype: list of chunks
        """
        chunks = list(self._correct)
        return [c[1] for c in chunks]  # discard position information

    def guessed(self):
        """
        Return the chunks which were included in the guessed
        chunk structures.  (Order follows set iteration, not input
        order.)

        :rtype: list of chunks
        """
        chunks = list(self._guessed)
        return [c[1] for c in chunks]  # discard position information

    def __len__(self):
        self._updateMeasures()
        return self._tp_num + self._fn_num

    def __repr__(self):
        """
        Return a concise representation of this ``ChunkScoring``.

        :rtype: str
        """
        return "<ChunkScoring of " + repr(len(self)) + " chunks>"

    def __str__(self):
        """
        Return a verbose representation of this ``ChunkScoring``.
        This representation includes the precision, recall, and
        f-measure scores.  For other information about the score,
        use the accessor methods (e.g., ``missed()`` and ``incorrect()``).

        :rtype: str
        """
        # BUGFIX: f-strings do not treat '%%' as an escape (unlike
        # %-formatting), so the old '%%' printed a literal double
        # percent sign.  A single '%' is the intended output.
        return (
            "ChunkParse score:\n"
            + f"    IOB Accuracy: {self.accuracy() * 100:5.1f}%\n"
            + f"    Precision:    {self.precision() * 100:5.1f}%\n"
            + f"    Recall:       {self.recall() * 100:5.1f}%\n"
            + f"    F-Measure:    {self.f_measure() * 100:5.1f}%"
        )
def _chunksets(t, count, chunk_label):
    """
    Extract the chunks from a chunked sentence whose labels match
    ``chunk_label``, giving each a unique id: the pair of the sentence
    counter and the absolute position of the chunk's first word.
    """
    offset = 0
    found = set()
    for node in t:
        if isinstance(node, Tree):
            if re.match(chunk_label, node.label()):
                # Freeze so the subtree is hashable and can live in a set.
                found.add(((count, offset), node.freeze()))
            offset += len(node.leaves())
        else:
            offset += 1
    return found
def tagstr2tree(
    s, chunk_label="NP", root_label="S", sep="/", source_tagset=None, target_tagset=None
):
    """
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :rtype: Tree
    """
    # A token is either a bracket or a maximal run of non-bracket,
    # non-whitespace characters.
    token_re = re.compile(r"\[|\]|[^\[\]\s]+")
    parse_stack = [Tree(root_label, [])]
    for m in token_re.finditer(s):
        token = m.group()
        if token.startswith("["):
            # Chunks may not nest: '[' is only legal at the top level.
            if len(parse_stack) != 1:
                raise ValueError(f"Unexpected [ at char {m.start():d}")
            chunk = Tree(chunk_label, [])
            parse_stack[-1].append(chunk)
            parse_stack.append(chunk)
        elif token.startswith("]"):
            # ']' is only legal while exactly one chunk is open.
            if len(parse_stack) != 2:
                raise ValueError(f"Unexpected ] at char {m.start():d}")
            parse_stack.pop()
        elif sep is None:
            # Untagged mode: keep the raw token.
            parse_stack[-1].append(token)
        else:
            word, tag = str2tuple(token, sep)
            if source_tagset and target_tagset:
                tag = map_tag(source_tagset, target_tagset, tag)
            parse_stack[-1].append((word, tag))
    if len(parse_stack) != 1:
        raise ValueError(f"Expected ] at char {len(s):d}")
    return parse_stack[0]
### CONLL _LINE_RE = re.compile(r"(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")
def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"):
    """
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.
    This function converts a CoNLL IOB string into a tree.
    It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    """
    tree_stack = [Tree(root_label, [])]
    for lineno, line in enumerate(s.split("\n")):
        if not line.strip():
            continue
        parsed = _LINE_RE.match(line)
        if parsed is None:
            raise ValueError(f"Error on line {lineno:d}")
        word, tag, state, chunk_type = parsed.groups()
        # Chunk types we were not asked to keep are treated as outside.
        if chunk_types is not None and chunk_type not in chunk_types:
            state = "O"
        # An "I" whose type differs from the currently open chunk both
        # closes that chunk and opens a new one, exactly like a "B".
        bad_continue = state == "I" and chunk_type != tree_stack[-1].label()
        if state in "BO" or bad_continue:
            if len(tree_stack) == 2:
                tree_stack.pop()
        if state == "B" or bad_continue:
            chunk = Tree(chunk_type, [])
            tree_stack[-1].append(chunk)
            tree_stack.append(chunk)
        # Attach the token to whichever node is currently open.
        tree_stack[-1].append((word, tag))
    return tree_stack[0]
def tree2conlltags(t):
    """
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    """
    rows = []
    for node in t:
        try:
            # Chunk subtree: first token gets B-, the rest I-.
            label = node.label()
            iob = "B-"
            for token in node:
                if isinstance(token, Tree):
                    raise ValueError(
                        "Tree is too deeply nested to be printed in CoNLL format"
                    )
                rows.append((token[0], token[1], iob + label))
                iob = "I-"
        except AttributeError:
            # Plain (word, tag) token outside any chunk.
            rows.append((node[0], node[1], "O"))
    return rows
def conlltags2tree(
    sentence, chunk_types=("NP", "PP", "VP"), root_label="S", strict=False
):
    """
    Convert the CoNLL IOB format to a tree.

    :param sentence: iterable of ``(word, postag, chunktag)`` triples.
    :param chunk_types: accepted chunk types (kept for API symmetry).
    :param root_label: label for the root of the resulting tree.
    :param strict: if true, raise ``ValueError`` on ill-formed IOB
        sequences instead of repairing them.
    :rtype: Tree
    """
    tree = Tree(root_label, [])
    for word, postag, chunktag in sentence:
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            # Repair: a missing chunk tag is treated as O.
            tree.append((word, postag))
        elif chunktag == "O":
            tree.append((word, postag))
        elif chunktag.startswith("B-"):
            tree.append(Tree(chunktag[2:], [(word, postag)]))
        elif chunktag.startswith("I-"):
            last = tree[-1] if len(tree) else None
            continues_open_chunk = (
                isinstance(last, Tree) and last.label() == chunktag[2:]
            )
            if continues_open_chunk:
                last.append((word, postag))
            elif strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Repair: an I- with no matching open chunk acts as B-.
                tree.append(Tree(chunktag[2:], [(word, postag)]))
        else:
            raise ValueError(f"Bad conll tag {chunktag!r}")
    return tree
def tree2conllstr(t):
    """
    Return a multiline string where each line contains a word, tag and
    IOB tag.  Convert a tree to the CoNLL IOB string format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: str
    """
    return "\n".join(" ".join(triple) for triple in tree2conlltags(t))
### IEER _IEER_DOC_RE = re.compile( r"<DOC>\s*" r"(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?" r"(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?" r"(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?" r"<BODY>\s*" r"(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?" r"<TEXT>(?P<text>.*?)</TEXT>\s*" r"</BODY>\s*</DOC>\s*", re.DOTALL, ) _IEER_TYPE_RE = re.compile(r'<b_\w+\s+[^>]*?type="(?P<type>\w+)"') def _ieer_read_text(s, root_label): stack = [Tree(root_label, [])] # s will be None if there is no headline in the text # return the empty list in place of a Tree if s is None: return [] for piece_m in re.finditer(r"<[^>]+>|[^\s<]+", s): piece = piece_m.group() try: if piece.startswith("<b_"): m = _IEER_TYPE_RE.match(piece) if m is None: print("XXXX", piece) chunk = Tree(m.group("type"), []) stack[-1].append(chunk) stack.append(chunk) elif piece.startswith("<e_"): stack.pop() # elif piece.startswith('<'): # print "ERROR:", piece # raise ValueError # Unexpected HTML else: stack[-1].append(piece) except (IndexError, ValueError) as e: raise ValueError( f"Bad IEER string (error at character {piece_m.start():d})" ) from e if len(stack) != 1: raise ValueError("Bad IEER string") return stack[0]
def ieerstr2tree(
    s,
    chunk_types=[
        "LOCATION",
        "ORGANIZATION",
        "PERSON",
        "DURATION",
        "DATE",
        "CARDINAL",
        "PERCENT",
        "MONEY",
        "MEASURE",
    ],
    root_label="S",
):
    """
    Return a chunk structure containing the chunked tagged text that is
    encoded in the given IEER style string.
    Convert a string of chunked tagged text in the IEER named
    entity format into a chunk structure.  Chunks are of several
    types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
    PERCENT, MONEY, and MEASURE.

    :rtype: Tree
    """
    # NOTE(review): chunk_types is accepted for API compatibility but is
    # not consulted anywhere in this function.
    # Try looking for a single document.  If that doesn't work, then just
    # treat everything as if it was within the <TEXT>...</TEXT>.
    doc = _IEER_DOC_RE.match(s)
    if doc is None:
        return _ieer_read_text(s, root_label)
    return {
        "text": _ieer_read_text(doc.group("text"), root_label),
        "docno": doc.group("docno"),
        "doctype": doc.group("doctype"),
        "date_time": doc.group("date_time"),
        # Parse the headline too, so named entities there are captured.
        "headline": _ieer_read_text(doc.group("headline"), root_label),
    }
def demo():
    """Demonstrate the chunk-format conversion functions in this module."""
    # Bracketted-tagged text: chunks marked with [ ... ], tokens as word/tag.
    s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
    import nltk

    t = nltk.chunk.tagstr2tree(s, chunk_label="NP")
    t.pprint()
    print()

    # CoNLL-2000 style IOB text: one "word tag iob" token per line.
    # NOTE(review): line breaks reconstructed from token boundaries —
    # confirm against the original layout.
    s = """
    These DT B-NP
    research NN I-NP
    protocols NNS I-NP
    offer VBP B-VP
    to TO B-PP
    the DT B-NP
    patient NN I-NP
    not RB O
    only RB O
    the DT B-NP
    very RB I-NP
    best JJS I-NP
    therapy NN I-NP
    which WDT B-NP
    we PRP B-NP
    have VBP B-VP
    established VBN I-VP
    today NN B-NP
    but CC B-NP
    also RB I-NP
    the DT B-NP
    hope NN I-NP
    of IN B-PP
    something NN B-NP
    still RB B-ADJP
    better JJR I-ADJP
    . . O
    """
    conll_tree = conllstr2tree(s, chunk_types=("NP", "PP"))
    conll_tree.pprint()

    # Demonstrate CoNLL output
    print("CoNLL output:")
    print(nltk.chunk.tree2conllstr(conll_tree))
    print()


if __name__ == "__main__":
    demo()