# Source code for the Regexp Chunk Parser Application.

# Natural Language Toolkit: Regexp Chunk Parser Application
# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <>
# URL: <>
# For license information, see LICENSE.TXT

"""
A graphical tool for exploring the regular expression based chunk
parser ``nltk.chunk.RegexpChunkParser``.
"""

# Todo: Add a way to select the development set from the menubar.  This
# might just need to be a selection box (conll vs treebank etc) plus
# configuration parameters to select what's being chunked (eg VP vs NP)
# and what part of the data is being used as the development set.

from __future__ import division
import time
import textwrap
import re
import random

from six.moves.tkinter import (Button, Canvas, Checkbutton, Frame, IntVar,
                               Label, Menu, Scrollbar, Text, Tk)
from six.moves.tkinter_tkfiledialog import askopenfilename, asksaveasfilename
from six.moves.tkinter_font import Font

from nltk.tree import Tree
from nltk.util import in_idle
from nltk.draw.util import ShowText
from nltk.corpus import conll2000, treebank_chunk
from nltk.chunk import ChunkScore, RegexpChunkParser
from nltk.chunk.regexp import RegexpChunkRule

class RegexpChunkApp(object):
    """
    A graphical tool for exploring the regular expression based chunk
    parser ``nltk.chunk.RegexpChunkParser``.

    See ``HELP`` for instructional text.
    """

    ##  Help Text

    #: A dictionary mapping from part of speech tags to descriptions,
    #: which is used in the help text.  (This should probably live with
    #: the conll and/or treebank corpus instead.)
    TAGSET = {
        'CC':   'Coordinating conjunction',   'PRP$': 'Possessive pronoun',
        'CD':   'Cardinal number',            'RB':   'Adverb',
        'DT':   'Determiner',                 'RBR':  'Adverb, comparative',
        'EX':   'Existential there',          'RBS':  'Adverb, superlative',
        'FW':   'Foreign word',               'RP':   'Particle',
        'JJ':   'Adjective',                  'TO':   'to',
        'JJR':  'Adjective, comparative',     'UH':   'Interjection',
        'JJS':  'Adjective, superlative',     'VB':   'Verb, base form',
        'LS':   'List item marker',           'VBD':  'Verb, past tense',
        'MD':   'Modal',                      'NNS':  'Noun, plural',
        'NN':   'Noun, singular or mass',     'VBN':  'Verb, past participle',
        'VBZ':  'Verb,3rd ps. sing. present', 'NNP':  'Proper noun, singular',
        'NNPS': 'Proper noun plural',         'WDT':  'wh-determiner',
        'PDT':  'Predeterminer',              'WP':   'wh-pronoun',
        'POS':  'Possessive ending',          'WP$':  'Possessive wh-pronoun',
        'PRP':  'Personal pronoun',           'WRB':  'wh-adverb',
        '(':    'open parenthesis',           ')':    'close parenthesis',
        '``':   'open quote',                 ',':    'comma',
        "''":   'close quote',                '.':    'period',
        '#':    'pound sign (currency marker)',
        '$':    'dollar sign (currency marker)',
        'IN':   'Preposition/subord. conjunction',
        'SYM':  'Symbol (mathematical or scientific)',
        'VBG':  'Verb, gerund/present participle',
        'VBP':  'Verb, non-3rd ps. sing. present',
        ':':    'colon',
        }

    #: Contents for the help box.  This is a list of tuples, one for
    #: each help page, where each tuple has three elements:
    #:   - A title (displayed as a tab)
    #:   - A string description of tabstops (see Tkinter.Text for details)
    #:   - The text contents for the help page.  You can use expressions
    #:     like <red>...</red> to colorize the text; see ``HELP_AUTOTAG``
    #:     for a list of tags you can use for colorizing.
    #
    # NOTE(review): in the reviewed copy the tuples and the list were not
    # terminated and several string fragments were missing.  The closers,
    # the "<indent>" opener, the ends of the two truncated rule
    # descriptions, the rule/example pattern lines and the closing
    # "</hangindent>" below were restored by inference -- please confirm
    # the wording against version control.
    HELP = [
        ('Help', '20',
         "Welcome to the regular expression chunk-parser grammar editor.  "
         "You can use this editor to develop and test chunk parser grammars "
         "based on NLTK's RegexpChunkParser class.\n\n"
         # Help box.
         "Use this box ('Help') to learn more about the editor; click on the "
         "tabs for help on specific topics:"
         "<indent>\n"
         "Rules: grammar rule types\n"
         "Regexps: regular expression syntax\n"
         "Tags: part of speech tags\n</indent>\n"
         # Grammar.
         "Use the upper-left box ('Grammar') to edit your grammar.  "
         "Each line of your grammar specifies a single 'rule', "
         "which performs an action such as creating a chunk or merging "
         "two chunks.\n\n"
         # Dev set.
         "The lower-left box ('Development Set') runs your grammar on the "
         "development set, and displays the results.  "
         "Your grammar's chunks are <highlight>highlighted</highlight>, and "
         "the correct (gold standard) chunks are "
         "<underline>underlined</underline>.  If they "
         "match, they are displayed in <green>green</green>; otherwise, "
         "they are displayed in <red>red</red>.  The box displays a single "
         "sentence from the development set at a time; use the scrollbar or "
         "the next/previous buttons view additional sentences.\n\n"
         # Performance
         "The lower-right box ('Evaluation') tracks the performance of "
         "your grammar on the development set.  The 'precision' axis "
         "indicates how many of your grammar's chunks are correct; and "
         "the 'recall' axis indicates how many of the gold standard "
         "chunks your system generated.  Typically, you should try to "
         "design a grammar that scores high on both metrics.  The "
         "exact precision and recall of the current grammar, as well "
         "as their harmonic mean (the 'f-score'), are displayed in "
         "the status bar at the bottom of the window."
         ),
        ('Rules', '10',
         "<h1>{...regexp...}</h1>"
         "<indent>\nChunk rule: creates new chunks from words matching "
         "regexp.</indent>\n\n"
         "<h1>}...regexp...{</h1>"
         "<indent>\nChink rule: removes words matching regexp from existing "
         "chunks.</indent>\n\n"
         "<h1>...regexp1...}{...regexp2...</h1>"
         "<indent>\nSplit rule: splits chunks that match regexp1 followed by "
         "regexp2 in two.</indent>\n\n"
         "<h1>...regexp...{}...regexp...</h1>"
         "<indent>\nMerge rule: joins consecutive chunks that match regexp1 "
         "and regexp2</indent>\n"
         ),
        ('Regexps', '10 60',
         #"Regular Expression Syntax Summary:\n\n"
         "\t<<var>T</var>>\ta word with tag <var>T</var> "
         "(where <var>T</var> may be a regexp).\n"
         "\t<var>x</var>?\tan optional <var>x</var>\n"
         "\t<var>x</var>+\ta sequence of 1 or more <var>x</var>'s\n"
         "\t<var>x</var>*\ta sequence of 0 or more <var>x</var>'s\n"
         "\t<var>x</var>|<var>y</var>\t<var>x</var> or <var>y</var>\n"
         "\t.\tmatches any character\n"
         "\t(<var>x</var>)\tTreats <var>x</var> as a group\n"
         "\t# <var>x...</var>\tTreats <var>x...</var> "
         "(to the end of the line) as a comment\n"
         "\t\\<var>C</var>\tmatches character <var>C</var> "
         "(useful when <var>C</var> is a special character "
         "like + or #)\n"
         "\n<h1>Examples:</h1>\n"
         '\t<regexp><NN></regexp>\n'
         '\t\tMatches <match>"cow/NN"</match>\n'
         '\t\tMatches <match>"green/NN"</match>\n'
         '\t<regexp><VB.*></regexp>\n'
         '\t\tMatches <match>"eating/VBG"</match>\n'
         '\t\tMatches <match>"ate/VBD"</match>\n'
         '\t<regexp><IN><DT><NN></regexp>\n'
         '\t\tMatches <match>"on/IN the/DT car/NN"</match>\n'
         '\t<regexp><RB>?<VBD></regexp>\n'
         '\t\tMatches <match>"ran/VBD"</match>\n'
         '\t\tMatches <match>"slowly/RB ate/VBD"</match>\n'
         # The backslash is doubled so the literal stays "\#" without
         # relying on an invalid string escape.
         '\t<regexp><\\#><CD> # This is a comment...</regexp>\n'
         '\t\tMatches <match>"#/# 100/CD"</match>\n'
         ),
        ('Tags', '10 60',
         "<h1>Part of Speech Tags:</h1>\n" +
         '<hangindent>' +
         '<<TAGSET>>' + # this gets auto-substituted w/ self.TAGSET
         '</hangindent>\n'
         ),
        ]

    #: The markup tags recognized in ``HELP`` text, each paired with the
    #: Text-widget tag options used to render the enclosed text.
    HELP_AUTOTAG = [
        ('red', dict(foreground='#a00')),
        ('green', dict(foreground='#080')),
        ('highlight', dict(background='#ddd')),
        ('underline', dict(underline=True)),
        ('h1', dict(underline=True)),
        ('indent', dict(lmargin1=20, lmargin2=20)),
        ('hangindent', dict(lmargin1=0, lmargin2=60)),
        ('var', dict(foreground='#88f')),
        ('regexp', dict(foreground='#ba7')),
        ('match', dict(foreground='#6a6')),
        ]

    ##  Config Parmeters

    _EVAL_DELAY = 1
    """If the user has not pressed any key for this amount of time (in
       seconds), and the current grammar has not been evaluated, then
       the eval demon will evaluate it."""

    _EVAL_CHUNK = 15
    """The number of sentences that should be evaluated by the eval
       demon each time it runs."""
    _EVAL_FREQ = 0.2
    """The frequency (in seconds) at which the eval demon is run"""
    _EVAL_DEMON_MIN = .02
    """The minimum amount of time that the eval demon should take each time
       it runs -- if it takes less than this time, _EVAL_CHUNK will be
       modified upwards."""
    _EVAL_DEMON_MAX = .04
    """The maximum amount of time that the eval demon should take each time
       it runs -- if it takes more than this time, _EVAL_CHUNK will be
       modified downwards."""

        width=40, height=12, background='#efe', highlightbackground='#efe',
        highlightthickness=1, relief='groove', border=2, wrap='word')
    _HELPBOX_PARAMS = dict(
        width=15, height=15, background='#efe', highlightbackground='#efe',
        highlightthickness=1, relief='groove', border=2, wrap='word')
        width=70, height=10, background='#eef', highlightbackground='#eef',
        highlightthickness=1, relief='groove', border=2, wrap='word',
    _STATUS_PARAMS = dict(
        background='#9bb', relief='groove', border=2)
    _FONT_PARAMS = dict(
        family='helvetica', size=-20)
    _FRAME_PARAMS = dict(
        background='#777', padx=2, pady=2, border=3)
    _EVALBOX_PARAMS = dict(
        background='#eef', highlightbackground='#eef',
        highlightthickness=1, relief='groove', border=2,
        width=300, height=280)
    _BUTTON_PARAMS = dict(
        background='#777', activebackground='#777',
    _HELPTAB_BG_COLOR = '#aba'
    _HELPTAB_FG_COLOR = '#efe'

    _HELPTAB_FG_PARAMS = dict(background='#efe')
    _HELPTAB_BG_PARAMS = dict(background='#aba')

    def normalize_grammar(self, grammar):
        """
        Return a canonical form of a grammar string, so that grammars
        differing only in comments or whitespace compare equal.

        :param grammar: The raw grammar text.
        :return: ``grammar`` with ``#`` comments removed, runs of spaces
            collapsed, whitespace after each newline dropped, leading and
            trailing whitespace stripped, and each ``$`` that follows a
            non-backslash character escaped as ``\\$``.
        """
        # Strip comments: keep the text up to the first unescaped '#'
        # ('\#' is an escaped hash and is kept); drop the rest of the line.
        grammar = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', grammar)
        # Normalize whitespace
        grammar = re.sub(r' +', ' ', grammar)
        grammar = re.sub(r'\n\s+', '\n', grammar)
        grammar = grammar.strip()
        # [xx] Hack: automatically backslash $!
        grammar = re.sub(r'([^\\])\$', r'\1\\$', grammar)
        return grammar

    def __init__(self, devset_name='conll2000', devset=None,
                 grammar='', chunk_label='NP', tagset=None):
        """
        :param devset_name: The name of the development set; used for
            display & for save files.  If either the name 'treebank'
            or the name 'conll2000' is used, and devset is None, then
            devset will be set automatically.
        :param devset: A list of chunked sentences
        :param grammar: The initial grammar to display.
        :param chunk_label: The chunk label handed to ``ChunkScore``
            when scoring the grammar.
        :param tagset: Dictionary from tags to string descriptions, used
            for the help page.  Defaults to ``self.TAGSET``.
        :raise ValueError: If ``devset`` is None and ``devset_name`` is
            neither 'conll2000' nor 'treebank'.
        """
        # NOTE(review): the reviewed copy of this method had lost several
        # lines (the ``else:`` before the ``raise``, the ``self.top``
        # assignment, the widget-initialisation calls and the final
        # display calls).  They are restored here by inference from the
        # rest of the class -- confirm against version control.
        self._chunk_label = chunk_label

        if tagset is None:
            tagset = self.TAGSET
        self.tagset = tagset

        # Named development sets.  The second test must look at
        # ``devset_name``: ``devset`` is always None inside this branch,
        # so comparing it to 'treebank' could never succeed.
        if devset is None:
            if devset_name == 'conll2000':
                devset = conll2000.chunked_sents('train.txt')#[:100]
            elif devset_name == 'treebank':
                devset = treebank_chunk.chunked_sents()#[:100]
            else:
                raise ValueError('Unknown development set %s' % devset_name)

        #: The chunker built from the grammar string
        self.chunker = None

        #: The unparsed grammar string
        self.grammar = grammar

        #: A normalized version of ``self.grammar``.
        self.normalized_grammar = None

        #: The last time() that the grammar was changed.
        self.grammar_changed = 0

        #: The development set -- a list of chunked sentences.
        self.devset = devset

        #: The name of the development set (for save files).
        self.devset_name = devset_name

        #: The index into the development set of the first instance
        #: that's currently being viewed.
        self.devset_index = -1

        #: The time() when a key was most recently pressed
        self._last_keypress = 0

        #: A list of (grammar, precision, recall, fscore) tuples for
        #: grammars that the user has already tried.
        self._history = []

        #: When the user is scrolling through previous grammars, this
        #: is used to keep track of which grammar they're looking at.
        self._history_index = 0

        #: The grammar that is being currently evaluated by the eval
        #: demon.
        self._eval_grammar = None

        #: A normalized copy of ``_eval_grammar``.
        self._eval_normalized_grammar = None

        #: The index of the next sentence in the development set that
        #: should be looked at by the eval demon.
        self._eval_index = 0

        #: The ``ChunkScore`` object that's used to keep track of the score
        #: of the current grammar on the development set.
        self._eval_score = ChunkScore(chunk_label=chunk_label)

        # Set up the main window.
        top = self.top = Tk()
        top.title('Regexp Chunk Parser App')
        top.bind('<Control-q>', self.destroy)

        # Variable that restricts how much of the devset we look at.
        self._devset_size = IntVar(top)
        # NOTE(review): initial size assumed to be 100 (one of the
        # Development-Set menu choices) -- confirm.
        self._devset_size.set(100)

        # Set up all the tkinter widgets.  Fonts come first because the
        # widgets use them; bindings come after the widgets they bind to.
        self._init_fonts(top)
        self._init_widgets(top)
        self._init_bindings(top)
        self._init_menubar(top)
        self.grammarbox.focus()

        # If a grammar was given, then display it.
        if grammar:
            self.grammarbox.insert('end', grammar+'\n')
            self.grammarbox.mark_set('insert', '1.0')

        # Display the first item in the development set
        self.show_devset(0)
        self.update()

    def _init_bindings(self, top):
        top.bind('<Control-n>', self._devset_next)
        top.bind('<Control-p>', self._devset_prev)
        top.bind('<Control-t>', self.toggle_show_trace)
        top.bind('<KeyPress>', self.update)
        top.bind('<Control-s>', lambda e: self.save_grammar())
        top.bind('<Control-o>', lambda e: self.load_grammar())
        self.grammarbox.bind('<Control-t>', self.toggle_show_trace)
        self.grammarbox.bind('<Control-n>', self._devset_next)
        self.grammarbox.bind('<Control-p>', self._devset_prev)

        # Redraw the eval graph when the window size changes
        self.evalbox.bind('<Configure>', self._eval_plot)

    def _init_fonts(self, top):
        """
        Create the size variable and the two fonts used by the widgets.

        :param top: The top-level window that owns the size variable.
        """
        # NOTE(review): both ``Font(...)`` calls were truncated in the
        # reviewed copy.  The default size of 20 matches
        # ``_FONT_PARAMS`` and the 'Medium' View-menu entry; the 14/20
        # ratio for the small font is an assumption -- confirm.
        # What's our font size (default=same as sysfont)
        self._size = IntVar(top)
        self._size.set(20)
        # The sizes are passed as negative numbers.
        self._font = Font(family='helvetica',
                          size=-self._size.get())
        self._smallfont = Font(family='helvetica',
                               size=-(int(self._size.get()*14//20)))

    def _init_menubar(self, parent):
        """
        Build the File / View / Development-Set / Help menus and attach
        the menubar to ``parent``.

        :param parent: The window that the menubar is attached to.
        """
        # NOTE(review): the continuation lines of most ``add_command`` /
        # ``add_cascade`` calls were missing from the reviewed copy.  The
        # ``command=`` targets ``self.reset``, ``self.save_history`` and
        # ``self.about`` are not visible elsewhere in the reviewed code
        # and are assumed -- confirm against version control.
        menubar = Menu(parent)

        filemenu = Menu(menubar, tearoff=0)
        filemenu.add_command(label='Reset Application', underline=0,
                             command=self.reset)
        filemenu.add_command(label='Save Current Grammar', underline=0,
                             accelerator='Ctrl-s',
                             command=self.save_grammar)
        filemenu.add_command(label='Load Grammar', underline=0,
                             accelerator='Ctrl-o',
                             command=self.load_grammar)

        filemenu.add_command(label='Save Grammar History', underline=13,
                             command=self.save_history)

        filemenu.add_command(label='Exit', underline=1,
                             command=self.destroy, accelerator='Ctrl-q')
        menubar.add_cascade(label='File', underline=0, menu=filemenu)

        # Font sizes; the selected value is stored in ``self._size``.
        viewmenu = Menu(menubar, tearoff=0)
        for label, size in (('Tiny', 10), ('Small', 16), ('Medium', 20),
                            ('Large', 24), ('Huge', 34)):
            viewmenu.add_radiobutton(label=label, variable=self._size,
                                     underline=0, value=size,
                                     command=self.resize)
        menubar.add_cascade(label='View', underline=0, menu=viewmenu)

        # How many development-set sentences to use.  The radiobuttons
        # are tied to ``self._devset_size`` so that a selection is stored
        # before ``set_devset_size`` runs.
        devsetmenu = Menu(menubar, tearoff=0)
        for count in (50, 100, 200, 500):
            devsetmenu.add_radiobutton(label='%d sentences' % count,
                                       variable=self._devset_size,
                                       value=count,
                                       command=self.set_devset_size)
        menubar.add_cascade(label='Development-Set', underline=0,
                            menu=devsetmenu)

        helpmenu = Menu(menubar, tearoff=0)
        helpmenu.add_command(label='About', underline=0,
                             command=self.about)
        menubar.add_cascade(label='Help', underline=0, menu=helpmenu)

        parent.config(menu=menubar)


    def toggle_show_trace(self, *e):
        if self._showing_trace:
        return 'break'

    _SCALE_N = 5 # center on the last 5 examples.
    _DRAW_LINES = False
    def _eval_plot(self, *e, **config):
        width = config.get('width', self.evalbox.winfo_width())
        height = config.get('height', self.evalbox.winfo_height())

        # Clear the canvas

        # Draw the precision & recall labels.
        tag = self.evalbox.create_text(10, height//2-10, justify='left',
                                 anchor='w', text='Precision')
        left, right = self.evalbox.bbox(tag)[2] + 5, width-10
        tag = self.evalbox.create_text(left + (width-left)//2, height-10,
                                anchor='s', text='Recall', justify='center')
        top, bot = 10, self.evalbox.bbox(tag)[1]-10

        # Draw masks for clipping the plot.
        bg = self._EVALBOX_PARAMS['background']
        self.evalbox.lower(self.evalbox.create_rectangle(0, 0, left-1, 5000,
                                                         fill=bg, outline=bg))
        self.evalbox.lower(self.evalbox.create_rectangle(0, bot+1, 5000, 5000,
                                                         fill=bg, outline=bg))

        # Calculate the plot's scale.
        if self._autoscale.get() and len(self._history) > 1:
            max_precision = max_recall = 0
            min_precision = min_recall = 1
            for i in range(1, min(len(self._history), self._SCALE_N+1)):
                grammar, precision, recall, fmeasure = self._history[-i]
                min_precision = min(precision, min_precision)
                min_recall = min(recall, min_recall)
                max_precision = max(precision, max_precision)
                max_recall = max(recall, max_recall)
#             if max_precision-min_precision > max_recall-min_recall:
#                 min_recall -= (max_precision-min_precision)/2
#                 max_recall += (max_precision-min_precision)/2
#             else:
#                 min_precision -= (max_recall-min_recall)/2
#                 max_precision += (max_recall-min_recall)/2
#             if min_recall < 0:
#                 max_recall -= min_recall
#                 min_recall = 0
#             if min_precision < 0:
#                 max_precision -= min_precision
#                 min_precision = 0
            min_precision = max(min_precision-.01, 0)
            min_recall = max(min_recall-.01, 0)
            max_precision = min(max_precision+.01, 1)
            max_recall = min(max_recall+.01, 1)
            min_precision = min_recall = 0
            max_precision = max_recall = 1

        # Draw the axis lines & grid lines
        for i in range(11):
            x = left + (right-left)*((i/10.-min_recall)/
            y = bot - (bot-top)*((i/10.-min_precision)/
            if left < x < right:
                self.evalbox.create_line(x, top, x, bot, fill='#888')
            if top < y < bot:
                self.evalbox.create_line(left, y, right, y, fill='#888')
        self.evalbox.create_line(left, top, left, bot)
        self.evalbox.create_line(left, bot, right, bot)

        # Display the plot's scale
            left-3, bot, justify='right', anchor='se',
            text='%d%%' % (100*min_precision))
            left-3, top, justify='right', anchor='ne',
            text='%d%%' % (100*max_precision))
            left, bot+3, justify='center', anchor='nw',
            text='%d%%' % (100*min_recall))
            right, bot+3, justify='center', anchor='ne',
            text='%d%%' % (100*max_recall))

        # Display the scores.
        prev_x = prev_y = None
        for i, (_, precision, recall, fscore) in enumerate(self._history):
            x = left + (right-left) * ((recall-min_recall) /
            y = bot - (bot-top) * ((precision-min_precision) /
            if i == self._history_index:
                                         fill='#0f0', outline='#000')
                self.status['text'] = (
                    'Precision: %.2f%%\t' % (precision*100)+
                    'Recall: %.2f%%\t' % (recall*100)+
                    'F-score: %.2f%%' % (fscore*100))
                                             fill='#afa', outline='#8c8'))
            if prev_x is not None and self._eval_lines.get():
                    self.evalbox.create_line(prev_x, prev_y, x, y,
            prev_x, prev_y = x, y

    _eval_demon_running = False
    def _eval_demon(self):
        if is None: return
        if self.chunker is None:
            self._eval_demon_running = False

        # Note our starting time.
        t0 = time.time()

        # If are still typing, then wait for them to finish.
        if (time.time()-self._last_keypress < self._EVAL_DELAY and
            self.normalized_grammar != self._eval_normalized_grammar):
            self._eval_demon_running = True
            return*1000), self._eval_demon)

        # If the grammar changed, restart the evaluation.
        if self.normalized_grammar != self._eval_normalized_grammar:
            # Check if we've seen this grammar already.  If so, then
            # just use the old evaluation values.
            for (g, p, r, f) in self._history:
                if self.normalized_grammar == self.normalize_grammar(g):
                    self._history.append( (g, p, r, f) )
                    self._history_index = len(self._history) - 1
                    self._eval_demon_running = False
                    self._eval_normalized_grammar = None
            self._eval_index = 0
            self._eval_score = ChunkScore(chunk_label=self._chunk_label)
            self._eval_grammar = self.grammar
            self._eval_normalized_grammar = self.normalized_grammar

        # If the grammar is empty, the don't bother evaluating it, or
        # recording it in history -- the score will just be 0.
        if self.normalized_grammar.strip() == '':
            #self._eval_index = self._devset_size.get()
            self._eval_demon_running = False

        # Score the next set of examples
        for gold in self.devset[self._eval_index:
            guess = self._chunkparse(gold.leaves())
            self._eval_score.score(gold, guess)

        # update our index in the devset.
        self._eval_index += self._EVAL_CHUNK

        # Check if we're done
        if self._eval_index >= self._devset_size.get():
            self._history.append( (self._eval_grammar,
                                   self._eval_score.f_measure()) )
            self._history_index = len(self._history)-1
            self._eval_demon_running = False
            self._eval_normalized_grammar = None
            progress = 100*self._eval_index/self._devset_size.get()
            self.status['text'] = ('Evaluating on Development Set (%d%%)' %
            self._eval_demon_running = True
            self._adaptively_modify_eval_chunk(time.time() - t0)
  *1000), self._eval_demon)

    def _adaptively_modify_eval_chunk(self, t):
        Modify _EVAL_CHUNK to try to keep the amount of time that the
        eval demon takes between _EVAL_DEMON_MIN and _EVAL_DEMON_MAX.

        :param t: The amount of time that the eval demon took.
        if t > self._EVAL_DEMON_MAX and self._EVAL_CHUNK > 5:
            self._EVAL_CHUNK = min(self._EVAL_CHUNK-1,
        elif t < self._EVAL_DEMON_MIN:
            self._EVAL_CHUNK = max(self._EVAL_CHUNK+1,

    def _init_widgets(self, top):
        """
        Create and lay out all of the application's widgets inside
        ``top``: the grammar editor, the help box with its tabs, the
        development-set box, the evaluation canvas, their buttons and
        scrollbars, and the status bar.

        :param top: The top-level window that contains the widgets.
        """
        # NOTE(review): roughly twenty constructor calls in this method
        # had lost their continuation lines in the reviewed copy.  The
        # restored keyword arguments (``**self._*_PARAMS``, ``pack``
        # sides, button commands, scrollbar wiring, tag colors, the
        # initial ``show_help`` call) are inferred from the surrounding
        # code -- confirm against version control.
        frame0 = Frame(top, **self._FRAME_PARAMS)
        frame0.grid_columnconfigure(0, weight=4)
        frame0.grid_columnconfigure(3, weight=2)
        frame0.grid_rowconfigure(1, weight=1)
        frame0.grid_rowconfigure(5, weight=1)

        # The grammar
        self.grammarbox = Text(frame0, font=self._font,
                               **self._GRAMMARBOX_PARAMS)
        self.grammarlabel = Label(frame0, font=self._font, text='Grammar:',
                      highlightcolor='black',
                      background=self._GRAMMARBOX_PARAMS['background'])
        self.grammarlabel.grid(column=0, row=0, sticky='SW')
        self.grammarbox.grid(column=0, row=1, sticky='NEWS')

        # Scroll bar for grammar
        grammar_scrollbar = Scrollbar(frame0, command=self.grammarbox.yview)
        grammar_scrollbar.grid(column=1, row=1, sticky='NWS')
        self.grammarbox.config(yscrollcommand=grammar_scrollbar.set)

        # grammar buttons
        bg = self._FRAME_PARAMS['background']
        frame3 = Frame(frame0, background=bg)
        frame3.grid(column=0, row=2, sticky='EW')
        Button(frame3, text='Prev Grammar', command=self._history_prev,
               **self._BUTTON_PARAMS).pack(side='left')
        Button(frame3, text='Next Grammar', command=self._history_next,
               **self._BUTTON_PARAMS).pack(side='left')

        # Help box
        self.helpbox = Text(frame0, font=self._smallfont,
                            **self._HELPBOX_PARAMS)
        self.helpbox.grid(column=3, row=1, sticky='NEWS')
        self.helptabs = {}
        bg = self._FRAME_PARAMS['background']
        helptab_frame = Frame(frame0, background=bg)
        helptab_frame.grid(column=3, row=0, sticky='SW')
        for i, (tab, tabstops, text) in enumerate(self.HELP):
            label = Label(helptab_frame, text=tab, font=self._smallfont)
            label.grid(column=i*2, row=0, sticky='S')
            #help_frame.grid_columnconfigure(i, weight=1)
            # ``tab=tab`` binds the current title; a plain closure would
            # see only the last one.
            label.bind('<ButtonPress>', lambda e, tab=tab: self.show_help(tab))
            self.helptabs[tab] = label
            Frame(helptab_frame, height=1, width=self._HELPTAB_SPACER,
                  background=bg).grid(column=i*2+1, row=0)
        self.helptabs[self.HELP[0][0]].configure(font=self._font)
        self.helpbox.tag_config('elide', elide=True)
        for (tag, params) in self.HELP_AUTOTAG:
            self.helpbox.tag_config('tag-%s' % tag, **params)
        self.show_help(self.HELP[0][0])

        # Scroll bar for helpbox
        help_scrollbar = Scrollbar(frame0, command=self.helpbox.yview)
        self.helpbox.config(yscrollcommand=help_scrollbar.set)
        help_scrollbar.grid(column=4, row=1, sticky='NWS')

        # The dev set
        frame4 = Frame(frame0, background=self._FRAME_PARAMS['background'])
        self.devsetbox = Text(frame4, font=self._font,
                              **self._DEVSETBOX_PARAMS)
        self.devsetbox.pack(expand=True, fill='both')
        self.devsetlabel = Label(frame0, font=self._font,
                      text='Development Set:', justify='right',
                      background=self._DEVSETBOX_PARAMS['background'])
        self.devsetlabel.grid(column=0, row=4, sticky='SW')
        frame4.grid(column=0, row=5, sticky='NEWS')

        # dev set scrollbars
        self.devset_scroll = Scrollbar(frame0, command=self._devset_scroll)
        self.devset_scroll.grid(column=1, row=5, sticky='NWS')
        self.devset_xscroll = Scrollbar(frame4, command=self.devsetbox.xview,
                                        orient='horiz')
        self.devsetbox['xscrollcommand'] = self.devset_xscroll.set
        self.devset_xscroll.pack(side='bottom', fill='x')

        # dev set buttons
        bg = self._FRAME_PARAMS['background']
        frame1 = Frame(frame0, background=bg)
        frame1.grid(column=0, row=7, sticky='EW')
        Button(frame1, text='Prev Example (Ctrl-p)',
               command=self._devset_prev,
               **self._BUTTON_PARAMS).pack(side='left')
        Button(frame1, text='Next Example (Ctrl-n)',
               command=self._devset_next,
               **self._BUTTON_PARAMS).pack(side='left')
        self.devset_button = Button(frame1, text='Show example',
                                    command=self.show_devset,
                                    state='disabled',
                                    **self._BUTTON_PARAMS)
        self.devset_button.pack(side='right')
        self.trace_button = Button(frame1, text='Show trace',
                                   command=self.show_trace,
                                   **self._BUTTON_PARAMS)
        self.trace_button.pack(side='right')

        # evaluation box
        self.evalbox = Canvas(frame0, **self._EVALBOX_PARAMS)
        label = Label(frame0, font=self._font, text='Evaluation:',
              justify='right', background=self._EVALBOX_PARAMS['background'])
        label.grid(column=3, row=4, sticky='SW')
        self.evalbox.grid(column=3, row=5, sticky='NEWS', columnspan=2)

        # evaluation box buttons
        bg = self._FRAME_PARAMS['background']
        frame2 = Frame(frame0, background=bg)
        frame2.grid(column=3, row=7, sticky='EW')
        self._autoscale = IntVar(top)
        self._autoscale.set(False)
        Checkbutton(frame2, variable=self._autoscale, command=self._eval_plot,
                    text='Zoom', **self._BUTTON_PARAMS).pack(side='left')
        self._eval_lines = IntVar(top)
        self._eval_lines.set(False)
        Checkbutton(frame2, variable=self._eval_lines, command=self._eval_plot,
                    text='Lines', **self._BUTTON_PARAMS).pack(side='left')
        Button(frame2, text='History',
               **self._BUTTON_PARAMS).pack(side='right')

        # The status label
        self.status = Label(frame0, font=self._font, **self._STATUS_PARAMS)
        self.status.grid(column=0, row=9, sticky='NEW', padx=3, pady=2,
                         columnspan=5)

        # Help box & devset box can't be edited.
        self.helpbox['state'] = 'disabled'
        self.devsetbox['state'] = 'disabled'

        # Spacers
        bg = self._FRAME_PARAMS['background']
        Frame(frame0, height=10, width=0, background=bg).grid(column=0, row=3)
        Frame(frame0, height=0, width=10, background=bg).grid(column=2, row=0)
        Frame(frame0, height=6, width=0, background=bg).grid(column=0, row=8)

        # pack the frame.
        frame0.pack(fill='both', expand=True)

        # Set up colors for the devset box
        self.devsetbox.tag_config('true-pos', background='#afa',
                                  underline='True')
        self.devsetbox.tag_config('false-neg', underline='True',
                                  foreground='#800')
        self.devsetbox.tag_config('false-pos', background='#faa')
        self.devsetbox.tag_config('trace', foreground='#666', wrap='none')
        self.devsetbox.tag_config('wrapindent', lmargin2=30, wrap='none')
        self.devsetbox.tag_config('error', foreground='#800')

        # And for the grammarbox
        self.grammarbox.tag_config('error', background='#fec')
        self.grammarbox.tag_config('comment', foreground='#840')
        self.grammarbox.tag_config('angle', foreground='#00f')
        self.grammarbox.tag_config('brace', foreground='#0a0')
        self.grammarbox.tag_config('hangindent', lmargin1=0, lmargin2=40)

    _showing_trace = False
    def show_trace(self, *e):
        """
        Fill the development-set box with a rule-by-rule trace for the
        current sentence: the tag sequence is shown once before any rule
        is applied and once after each rule, with the chunks produced so
        far colored against the gold standard.

        :param e: Ignored; accepts the optional Tk event argument.
        """
        self._showing_trace = True
        self.trace_button['state'] = 'disabled'
        self.devset_button['state'] = 'normal'

        self.devsetbox['state'] = 'normal'
        #self.devsetbox['wrap'] = 'none'
        self.devsetbox.delete('1.0', 'end')
        self.devsetlabel['text']='Development Set (%d/%d)' % (
            (self.devset_index+1, self._devset_size.get()))

        if self.chunker is None:
            self.devsetbox.insert('1.0', 'Trace: waiting for a valid grammar.')
            self.devsetbox.tag_add('error', '1.0', 'end')
            return # can't do anything more

        gold_tree = self.devset[self.devset_index]
        rules = self.chunker.rules()

        # Calculate the tag sequence, remembering the character offset at
        # which each tag starts.
        # NOTE(review): the ``charnum.append`` line was missing from the
        # reviewed copy and is restored by inference -- confirm.
        tagseq = '\t'
        charnum = [1]
        for (word, pos) in gold_tree.leaves():
            tagseq += '%s ' % pos
            charnum.append(len(tagseq))
        self.charnum = dict(((i, j), charnum[j])
                            for i in range(len(rules)+1)
                            for j in range(len(charnum)))
        self.linenum = dict((i,i*2+2) for i in range(len(rules)+1))

        for i in range(len(rules)+1):
            if i == 0:
                self.devsetbox.insert('end', 'Start:\n')
                self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')
            else:
                self.devsetbox.insert('end', 'Apply %s:\n' % rules[i-1])
                self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')
            # Display the tag sequence.
            self.devsetbox.insert('end', tagseq+'\n')
            self.devsetbox.tag_add('wrapindent','end -2c linestart','end -2c')
            # Run a partial parser, and extract gold & test chunks.  The
            # partial parser (first ``i`` rules only) must be the one
            # that parses; parsing with the full chunker would show the
            # final result at every step.
            partial_chunker = RegexpChunkParser(rules[:i],
                                                chunk_label=self._chunk_label)
            try:
                test_tree = partial_chunker.parse(gold_tree.leaves())
            except (ValueError, IndexError):
                # presumably raised for rules that yield an invalid chunk
                # structure -- fall back to an unchunked sentence.
                test_tree = Tree(gold_tree.label(), gold_tree.leaves())
            gold_chunks = self._chunks(gold_tree)
            test_chunks = self._chunks(test_tree)
            # Compare them.
            for chunk in gold_chunks.intersection(test_chunks):
                self._color_chunk(i, chunk, 'true-pos')
            for chunk in gold_chunks - test_chunks:
                self._color_chunk(i, chunk, 'false-neg')
            for chunk in test_chunks - gold_chunks:
                self._color_chunk(i, chunk, 'false-pos')
        self.devsetbox.insert('end', 'Finished.\n')
        self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')

        # This is a hack, because the x-scrollbar isn't updating its
        # position right -- I'm not sure what the underlying cause is
        # though.  (This is on OS X w/ python 2.5)
        # NOTE(review): the call below was fused into the comment above
        # in the reviewed copy; the 100 ms delay is an assumption.
        self.top.after(100, self.devset_xscroll.set, 0, .3)

    def show_help(self, tab):
        """
        Display the help page named ``tab`` in the help box.

        The ``<<TAGSET>>`` placeholder in the page text is replaced by a
        table of ``self.tagset`` entries.  For every tag listed in
        ``HELP_AUTOTAG``, the text between ``<tag>`` and ``</tag>`` gets
        the Text-widget tag ``tag-<tag>`` and the markers themselves get
        the ``elide`` tag.

        :param tab: Name of the help page; compared against the first
            element of each ``self.HELP`` entry.
        """
        def tagset_order(item):
            # Tags that start with a word character sort before the
            # punctuation tags; ties are broken alphabetically.
            tag = item[0]
            return (0 if re.match(r'\w+', tag) else 1, tag)

        self.helpbox['state'] = 'normal'
        self.helpbox.delete('1.0', 'end')
        for (name, tabstops, text) in self.HELP:
            if name == tab:
                tagset_rows = '\n'.join(
                    '\t%s\t%s' % item
                    for item in sorted(self.tagset.items(), key=tagset_order))
                text = text.replace('<<TAGSET>>', tagset_rows)

                self.helptabs[name].config(**self._HELPTAB_FG_PARAMS)
                self.helpbox.config(tabs=tabstops)
                self.helpbox.insert('1.0', text+'\n'*20)
                C = '1.0 + %d chars'
                for (tag, params) in self.HELP_AUTOTAG:
                    pattern = '(?s)(<%s>)(.*?)(</%s>)' % (tag, tag)
                    for m in re.finditer(pattern, text):
                        self.helpbox.tag_add('elide',
                                             C % m.start(1), C % m.end(1))
                        self.helpbox.tag_add('tag-%s' % tag,
                                             C % m.start(2), C % m.end(2))
                        self.helpbox.tag_add('elide',
                                             C % m.start(3), C % m.end(3))
            else:
                self.helptabs[name].config(**self._HELPTAB_BG_PARAMS)
        self.helpbox['state'] = 'disabled'

    def _history_prev(self, *e):
        return 'break'

    def _history_next(self, *e):
        return 'break'

    def _view_history(self, index):
        """
        Load the history entry at ``index`` into the grammar box and
        refresh everything that depends on the grammar.

        :param index: Position in ``self._history``; out-of-range values
            are clamped to the nearest valid position.
        """
        # Bounds & sanity checking:
        index = max(0, min(len(self._history)-1, index))
        if not self._history:
            return
        # Already viewing the requested history item?
        if index == self._history_index:
            return
        grammar = self._history[index][0]
        # Show the requested grammar.  It will get added to _history
        # only if they edit it (causing self.update() to get run.)
        self.grammarbox['state'] = 'normal'
        self.grammarbox.delete('1.0', 'end')
        self.grammarbox.insert('end', grammar)
        self.grammarbox.mark_set('insert', '1.0')
        self._history_index = index
        self._syntax_highlight_grammar(grammar)
        # Record the normalized grammar & regenerate the chunker.
        self.normalized_grammar = self.normalize_grammar(grammar)
        if self.normalized_grammar:
            rules = [RegexpChunkRule.fromstring(line)
                     for line in self.normalized_grammar.split('\n')]
        else:
            rules = []
        self.chunker = RegexpChunkParser(rules)
        # Show the score.
        self._eval_plot()
        # Update the devset box
        self._highlight_devset()
        if self._showing_trace:
            self.show_trace()
        # Update the grammar label
        if self._history_index < len(self._history)-1:
            self.grammarlabel['text'] = 'Grammar %s/%s:' % (
                self._history_index+1, len(self._history))
        else:
            self.grammarlabel['text'] = 'Grammar:'

    def _devset_next(self, *e):
        """
        Advance the development-set view by one page.

        :param e: Ignored; lets the method be used as an event callback.
        :return: ``'break'``.
        """
        direction = 1
        self._devset_scroll('scroll', direction, 'page')
        return 'break'

    def _devset_prev(self, *e):
        """
        Move the development-set view back by one page.

        :param e: Ignored; lets the method be used as an event callback.
        :return: ``'break'``.
        """
        direction = -1
        self._devset_scroll('scroll', direction, 'page')
        return 'break'

    def destroy(self, *e):
        """
        Close the application window.  Does nothing if the window has
        already been destroyed.

        :param e: Ignored; lets the method be used as an event callback.
        """
        if self.top is None:
            return
        self.top.destroy()
        # Mark the window as gone so later calls become no-ops.
        self.top = None

    def _devset_scroll(self, command, *args):
        """
        Scrollbar-style callback that changes which development-set
        sentence is displayed.

        :param command: ``'scroll'`` or ``'moveto'``.
        :param args: For ``'scroll'``: a step count followed by a unit
            name starting with ``'unit'`` or ``'page'``.  For
            ``'moveto'``: a fraction of the development set.
        """
        N = 1 # size of a page -- one sentence.
        # show_devset() leaves trace mode, so remember whether the trace
        # has to be redrawn afterwards.
        showing_trace = self._showing_trace
        if command == 'scroll' and args[1].startswith('unit'):
            self.show_devset(self.devset_index+int(args[0]))
        elif command == 'scroll' and args[1].startswith('page'):
            self.show_devset(self.devset_index+N*int(args[0]))
        elif command == 'moveto':
            self.show_devset(int(float(args[0])*self._devset_size.get()))
        else:
            assert 0, 'bad scroll command %s %s' % (command, args)
        if showing_trace:
            self.show_trace()

    def show_devset(self, index=None):
        """
        Display one development-set sentence as ``word/tag`` tokens and
        highlight its chunks using the current chunker, if there is one.

        :param index: Sentence to show; defaults to the current
            ``devset_index`` and is clamped to the valid range.
        """
        if index is None:
            index = self.devset_index

        # Bounds checking
        index = min(max(0, index), self._devset_size.get()-1)

        # Nothing to redraw unless we are leaving trace mode.
        if index == self.devset_index and not self._showing_trace:
            return
        self.devset_index = index

        self._showing_trace = False
        self.trace_button['state'] = 'normal'
        self.devset_button['state'] = 'disabled'

        # Clear the text box.
        self.devsetbox['state'] = 'normal'
        self.devsetbox['wrap'] = 'word'
        self.devsetbox.delete('1.0', 'end')
        self.devsetlabel['text'] = 'Development Set (%d/%d)' % (
            (self.devset_index+1, self._devset_size.get()))

        # Add the sentences, recording for each word the column at
        # which it starts so that chunks can be highlighted later.
        sample = self.devset[self.devset_index:self.devset_index+1]
        self.charnum = {}
        self.linenum = {0: 1}
        for sentnum, sent in enumerate(sample):
            linestr = ''
            for wordnum, (word, pos) in enumerate(sent.leaves()):
                self.charnum[sentnum, wordnum] = len(linestr)
                linestr += '%s/%s ' % (word, pos)
                self.charnum[sentnum, wordnum+1] = len(linestr)
            self.devsetbox.insert('end', linestr[:-1]+'\n\n')

        # Highlight chunks in the dev set
        if self.chunker is not None:
            self._highlight_devset()
        self.devsetbox['state'] = 'disabled'

        # Update the scrollbar
        first = self.devset_index/self._devset_size.get()
        last = (self.devset_index + 2) / self._devset_size.get()
        self.devset_scroll.set(first, last)

    def _chunks(self, tree):
        """
        Collect the word spans of the chunks found at the top level of
        ``tree``.

        :param tree: An iterable whose items are either ``Tree``
            subtrees or individual tokens.
        :return: A set of ``(start, end)`` word-index pairs, one for
            each subtree whose label equals ``self._chunk_label``;
            ``end`` is exclusive.
        """
        chunks = set()
        wordnum = 0
        for child in tree:
            if isinstance(child, Tree):
                if child.label() == self._chunk_label:
                    chunks.add((wordnum, wordnum+len(child)))
                # Any subtree, chunk or not, advances by its own length.
                wordnum += len(child)
            else:
                # A bare token counts as a single word.
                wordnum += 1
        return chunks

    def _syntax_highlight_grammar(self, grammar):
        """
        Re-apply the ``comment``, ``angle`` and ``brace`` tags to the
        grammar box.

        :param grammar: The grammar text; line and column positions in
            this string are used directly as Text-widget indices.
        """
        if self.top is None:
            return
        self.grammarbox.tag_remove('comment', '1.0', 'end')
        self.grammarbox.tag_remove('angle', '1.0', 'end')
        self.grammarbox.tag_remove('brace', '1.0', 'end')
        self.grammarbox.tag_add('hangindent', '1.0', 'end')
        for lineno, line in enumerate(grammar.split('\n')):
            if not line.strip():
                continue
            # Group 2 is the trailing comment: the first '#' that is
            # not escaped with a backslash, through end of line.
            m = re.match(r'(\\.|[^#])*(#.*)?', line)
            comment_start = None
            if m.group(2):
                comment_start = m.start(2)
                s = '%d.%d' % (lineno+1, m.start(2))
                e = '%d.%d' % (lineno+1, m.end(2))
                self.grammarbox.tag_add('comment', s, e)
            for m in re.finditer('[<>{}]', line):
                # Brackets inside the comment are not highlighted.
                if comment_start is not None and m.start() >= comment_start:
                    break
                s = '%d.%d' % (lineno+1, m.start())
                e = '%d.%d' % (lineno+1, m.end())
                if m.group() in '<>':
                    self.grammarbox.tag_add('angle', s, e)
                else:
                    self.grammarbox.tag_add('brace', s, e)

    def _grammarcheck(self, grammar):
        """
        Tag every grammar line that fails to parse as a chunk rule with
        the ``error`` tag, then clear the status line.

        :param grammar: The un-normalized grammar text, so that line
            numbers match the contents of the grammar box.
        """
        if self.top is None:
            return
        self.grammarbox.tag_remove('error', '1.0', 'end')
        self._grammarcheck_errs = []
        for lineno, line in enumerate(grammar.split('\n')):
            # Drop the trailing comment before parsing the rule.
            line = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', line)
            line = line.strip()
            if line:
                try:
                    RegexpChunkRule.fromstring(line)
                except ValueError:
                    self.grammarbox.tag_add('error', '%s.0' % (lineno+1),
                                            '%s.0 lineend' % (lineno+1))
        self.status['text'] = ''

    def update(self, *event):
        """
        Re-read the grammar box and, if the normalized grammar changed,
        rebuild the chunker and refresh the display.

        If the grammar does not parse, the offending lines are flagged
        and ``self.chunker`` is set to None.

        :param event: When non-empty, the call is treated as triggered
            by a keypress and its time is recorded.
        """
        # Record when update was called (for grammarcheck)
        if event:
            self._last_keypress = time.time()

        # Read the grammar from the Text box.
        self.grammar = grammar = self.grammarbox.get('1.0', 'end')

        # If the grammar hasn't changed, do nothing:
        normalized_grammar = self.normalize_grammar(grammar)
        if normalized_grammar == self.normalized_grammar:
            return
        self.normalized_grammar = normalized_grammar

        # If the grammar has changed, and we're looking at history,
        # then stop looking at history.
        if self._history_index < len(self._history)-1:
            self.grammarlabel['text'] = 'Grammar:'

        self._syntax_highlight_grammar(grammar)

        # The grammar has changed; try parsing it.  If it doesn't
        # parse, do nothing.  (flag error location?)
        try:
            # Note: the normalized grammar has no blank lines.
            if normalized_grammar:
                rules = [RegexpChunkRule.fromstring(line)
                         for line in normalized_grammar.split('\n')]
            else:
                rules = []
        except ValueError:
            # Use the un-normalized grammar for error highlighting.
            self._grammarcheck(grammar)
            self.chunker = None
            return

        self.chunker = RegexpChunkParser(rules)
        self.grammarbox.tag_remove('error', '1.0', 'end')
        self.grammar_changed = time.time()
        # Display the results
        if self._showing_trace:
            self.show_trace()
        else:
            self._highlight_devset()
        # Start the eval demon
        if not self._eval_demon_running:
            self._eval_demon()

    def _highlight_devset(self, sample=None):
        """
        Color the chunks of the displayed sentences according to how
        the current chunker's output compares with the gold standard.

        :param sample: Gold-standard trees to score; defaults to the
            single sentence at ``self.devset_index``.
        """
        if sample is None:
            start = self.devset_index
            sample = self.devset[start:start+1]

        # Drop any highlighting left over from a previous run.
        for stale_tag in ('true-pos', 'false-neg', 'false-pos'):
            self.devsetbox.tag_remove(stale_tag, '1.0', 'end')

        for sentnum, gold_tree in enumerate(sample):
            # Chunk the sentence and extract gold & test spans.
            parsed = self._chunkparse(gold_tree.leaves())
            gold = self._chunks(gold_tree)
            test = self._chunks(parsed)
            # Classify each span by where it occurs.
            comparison = (
                ('true-pos', gold.intersection(test)),
                ('false-neg', gold - test),
                ('false-pos', test - gold),
            )
            for tag, spans in comparison:
                for span in spans:
                    self._color_chunk(sentnum, span, tag)

    def _chunkparse(self, words):
        """
        Parse ``words`` with the current chunker.

        :param words: The tagged tokens of one sentence.
        :return: The chunker's result, or ``words`` unchanged if the
            chunker raises ``ValueError`` or ``IndexError``.
        """
        try:
            return self.chunker.parse(words)
        except (ValueError, IndexError):
            # There's an error somewhere in the grammar, but we're not sure
            # exactly where, so just mark the whole grammar as bad.
            # E.g., this is caused by: "({<NN>})"
            self.grammarbox.tag_add('error', '1.0', 'end')
            # Treat it as tagging nothing:
            return words

    def _color_chunk(self, sentnum, chunk, tag):
        """
        Apply the Text-widget tag ``tag`` to one chunk in the
        development-set box.

        :param sentnum: Key into ``self.linenum`` / ``self.charnum``
            identifying the displayed line.
        :param chunk: A ``(start, end)`` pair of word indices.
        :param tag: Name of the Text-widget tag to apply.
        """
        start, end = chunk
        # The end column is reduced by one so that the separator after
        # the chunk's last word is not colored.
        self.devsetbox.tag_add(tag,
            '%s.%s' % (self.linenum[sentnum], self.charnum[sentnum, start]),
            '%s.%s' % (self.linenum[sentnum], self.charnum[sentnum, end]-1))

    def reset(self):
        """
        Discard the current grammar, chunker and history, and redraw
        the grammar and development-set boxes.
        """
        # Clear various variables
        self.chunker = None
        self.grammar = None
        self.normalized_grammar = None
        self.grammar_changed = 0
        self._history = []
        self._history_index = 0
        # Update the on-screen display.
        self.grammarbox.delete('1.0', 'end')
        self.show_devset(0)
        self.update()

    #: Layout of a saved grammar file: a commented header followed by
    #: the grammar text.  Filled in with ``%`` and a dict.
    SAVE_GRAMMAR_TEMPLATE = (
        '# Regexp Chunk Parsing Grammar\n'
        '# Saved %(date)s\n'
        '# Development set: %(devset)s\n'
        '#   Precision: %(precision)s\n'
        '#   Recall:    %(recall)s\n'
        '#   F-score:   %(fscore)s\n\n'
        '%(grammar)s\n')

    def save_grammar(self, filename=None):
        """
        Write the current grammar to a file, preceded by a commented
        header giving the date, development set and scores.

        :param filename: Destination path.  If empty or None, the user
            is asked to pick one; cancelling the dialog aborts the save.
        """
        if not filename:
            ftypes = [('Chunk Grammar', '.chunk'),
                      ('All files', '*')]
            filename = asksaveasfilename(filetypes=ftypes,
                                         defaultextension='.chunk')
            if not filename:
                return
        # Scores are only known if the current grammar is the one most
        # recently added to the history.
        if (self._history and self.normalized_grammar ==
                self.normalize_grammar(self._history[-1][0])):
            precision, recall, fscore = ['%.2f%%' % (100*v) for v in
                                         self._history[-1][1:]]
        elif self.chunker is None:
            precision = recall = fscore = 'Grammar not well formed'
        else:
            precision = recall = fscore = 'Not finished evaluation yet'

        with open(filename, 'w') as outfile:
            outfile.write(self.SAVE_GRAMMAR_TEMPLATE % dict(
                date=time.ctime(), devset=self.devset_name,
                precision=precision, recall=recall, fscore=fscore,
                grammar=self.grammar.strip()))
    def load_grammar(self, filename=None):
        """
        Replace the contents of the grammar box with a grammar read
        from a file.  A header of the kind written by ``save_grammar``
        is removed from the text first.

        :param filename: Source path.  If empty or None, the user is
            asked to pick one; cancelling the dialog aborts the load.
        """
        if not filename:
            ftypes = [('Chunk Grammar', '.chunk'),
                      ('All files', '*')]
            filename = askopenfilename(filetypes=ftypes,
                                       defaultextension='.chunk')
            if not filename:
                return
        self.grammarbox.delete('1.0', 'end')
        self.update()
        with open(filename, 'r') as infile:
            grammar = infile.read()
        # Strip everything from the header's first line through the
        # end of its "F-score:" line.
        grammar = re.sub(r'^\# Regexp Chunk Parsing Grammar[\s\S]*'
                         r'F-score:.*\n', '', grammar).lstrip()
        self.grammarbox.insert('1.0', grammar)
        self.update()

    def save_history(self, filename=None):
        if not filename:
            ftypes = [('Chunk Gramamr History', '.txt'),
                      ('All files', '*')]
            filename = asksaveasfilename(filetypes=ftypes,
            if not filename: return

        with open(filename, 'w') as outfile:
            outfile.write('# Regexp Chunk Parsing Grammar History\n')
            outfile.write('# Saved %s\n' % time.ctime())
            outfile.write('# Development set: %s\n' % self.devset_name)
            for i, (g, p, r, f) in enumerate(self._history):
                hdr = ('Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, '
                       'fscore=%.2f%%)' % (i+1, len(self._history),
                                           p*100, r*100, f*100))
                outfile.write('\n%s\n' % hdr)
                outfile.write(''.join('  %s\n' % line for line in g.strip().split()))

            if not (self._history and self.normalized_grammar ==
                if self.chunker is None:
                    outfile.write('\nCurrent Grammar (not well-formed)\n')
                    outfile.write('\nCurrent Grammar (not evaluated)\n')
                outfile.write(''.join('  %s\n' % line for line
                                  in self.grammar.strip().split()))

    def about(self, *e):
        """
        Show the "about" box, preferring the Tk message box and falling
        back to a ``ShowText`` window if that cannot be used.

        :param e: Ignored; lets the method be used as an event callback.
        """
        ABOUT = ("NLTK RegExp Chunk Parser Application\n"+
                 "Written by Edward Loper")
        TITLE = 'About: Regular Expression Chunk Parser Application'
        try:
            from six.moves.tkinter_messagebox import Message
            Message(message=ABOUT, title=TITLE).show()
        except Exception:
            # Deliberate best effort: any failure to import or show the
            # message box falls back to the plain text window.
            ShowText(self.top, TITLE, ABOUT)

    def set_devset_size(self, size=None):
        """
        Set how many development-set sentences are in use, capped at
        the number available, and redraw the development-set box.

        :param size: New size; if None, the current value is only
            re-clamped.
        """
        if size is not None:
            self._devset_size.set(size)
        self._devset_size.set(min(len(self.devset), self._devset_size.get()))
        # show_devset() returns early when asked for the sentence that
        # is already displayed, so visit sentence 1 before going back
        # to sentence 0 to make the redraw happen.
        self.show_devset(1)
        self.show_devset(0)
        # what about history?  Evaluated at diff dev set sizes!

    def resize(self, size=None):
        """
        Change the size of the application's fonts.

        :param size: New base font size; if None, the value already
            stored in ``self._size`` is re-applied.
        """
        if size is not None:
            self._size.set(size)
        size = self._size.get()
        # Tk treats negative font sizes as pixels rather than points.
        self._font.configure(size=-(abs(size)))
        # The small font is 14/20 of the base size, but never smaller
        # than 10 pixels.
        self._smallfont.configure(size=min(-10, -(abs(size))*14//20))

    def mainloop(self, *args, **kwargs):
        """
        Enter the Tkinter mainloop.  This function must be called if
        this demo is created from a non-interactive program (e.g.
        from a script); otherwise, the demo will close as soon as
        the script completes.

        Returns immediately, without entering the loop, when
        ``in_idle()`` is true.
        """
        if in_idle():
            return
        self.top.mainloop(*args, **kwargs)

def app():
    """Create a ``RegexpChunkApp`` and run its main loop."""
    RegexpChunkApp().mainloop()


if __name__ == '__main__':
    app()

__all__ = ['app']