Source code for nltk.app.collocations_app

# Natural Language Toolkit: Collocations Application
# Much of the GUI code is imported from concordance.py; We intend to merge these tools together
# Copyright (C) 2001-2019 NLTK Project
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#


from __future__ import division

import threading

from six.moves import queue as q
from six.moves.tkinter_font import Font
from six.moves.tkinter import (
    Button,
    END,
    Frame,
    IntVar,
    LEFT,
    Label,
    Menu,
    OptionMenu,
    SUNKEN,
    Scrollbar,
    StringVar,
    Text,
    Tk,
)

from nltk.corpus import (
    cess_cat,
    brown,
    nps_chat,
    treebank,
    sinica_treebank,
    alpino,
    indian,
    floresta,
    mac_morpho,
    machado,
    cess_esp,
)
from nltk.util import in_idle
from nltk.probability import FreqDist


CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'
POLL_INTERVAL = 100

_DEFAULT = 'English: Brown Corpus (Humor)'
_CORPORA = {
    'Catalan: CESS-CAT Corpus': lambda: cess_cat.words(),
    'English: Brown Corpus': lambda: brown.words(),
    'English: Brown Corpus (Press)': lambda: brown.words(
        categories=['news', 'editorial', 'reviews']
    ),
    'English: Brown Corpus (Religion)': lambda: brown.words(categories='religion'),
    'English: Brown Corpus (Learned)': lambda: brown.words(categories='learned'),
    'English: Brown Corpus (Science Fiction)': lambda: brown.words(
        categories='science_fiction'
    ),
    'English: Brown Corpus (Romance)': lambda: brown.words(categories='romance'),
    'English: Brown Corpus (Humor)': lambda: brown.words(categories='humor'),
    'English: NPS Chat Corpus': lambda: nps_chat.words(),
    'English: Wall Street Journal Corpus': lambda: treebank.words(),
    'Chinese: Sinica Corpus': lambda: sinica_treebank.words(),
    'Dutch: Alpino Corpus': lambda: alpino.words(),
    'Hindi: Indian Languages Corpus': lambda: indian.words(files='hindi.pos'),
    'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.words(),
    'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.words(),
    'Portuguese: Machado Corpus (Brazil)': lambda: machado.words(),
    'Spanish: CESS-ESP Corpus': lambda: cess_esp.words(),
}


class CollocationsView:
    _BACKGROUND_COLOUR = '#FFF'  # white

    def __init__(self):
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
        self.load_corpus(self.model.DEFAULT_CORPUS)
        self.after = self.top.after(POLL_INTERVAL, self._poll)

    def _init_top(self, top):
        top.geometry('550x650+50+50')
        top.title('NLTK Collocations List')
        top.bind('<Control-q>', self.destroy)
        top.protocol('WM_DELETE_WINDOW', self.destroy)
        top.minsize(550, 650)

    def _init_widgets(self, parent):
        self.main_frame = Frame(
            parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1)
        )
        self._init_corpus_select(self.main_frame)
        self._init_results_box(self.main_frame)
        self._init_paging(self.main_frame)
        self._init_status(self.main_frame)
        self.main_frame.pack(fill='both', expand=True)

    def _init_corpus_select(self, parent):
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.var = StringVar(innerframe)
        self.var.set(self.model.DEFAULT_CORPUS)
        Label(
            innerframe,
            justify=LEFT,
            text=' Corpus: ',
            background=self._BACKGROUND_COLOUR,
            padx=2,
            pady=1,
            border=0,
        ).pack(side='left')

        other_corpora = list(self.model.CORPORA.keys()).remove(
            self.model.DEFAULT_CORPUS
        )
        om = OptionMenu(
            innerframe,
            self.var,
            self.model.DEFAULT_CORPUS,
            command=self.corpus_selected,
            *self.model.non_default_corpora()
        )
        om['borderwidth'] = 0
        om['highlightthickness'] = 1
        om.pack(side='left')
        innerframe.pack(side='top', fill='x', anchor='n')

    def _init_status(self, parent):
        self.status = Label(
            parent,
            justify=LEFT,
            relief=SUNKEN,
            background=self._BACKGROUND_COLOUR,
            border=0,
            padx=1,
            pady=0,
        )
        self.status.pack(side='top', anchor='sw')

    def _init_menubar(self):
        self._result_size = IntVar(self.top)
        menubar = Menu(self.top)

        filemenu = Menu(menubar, tearoff=0, borderwidth=0)
        filemenu.add_command(
            label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-q'
        )
        menubar.add_cascade(label='File', underline=0, menu=filemenu)

        editmenu = Menu(menubar, tearoff=0)
        rescntmenu = Menu(editmenu, tearoff=0)
        rescntmenu.add_radiobutton(
            label='20',
            variable=self._result_size,
            underline=0,
            value=20,
            command=self.set_result_size,
        )
        rescntmenu.add_radiobutton(
            label='50',
            variable=self._result_size,
            underline=0,
            value=50,
            command=self.set_result_size,
        )
        rescntmenu.add_radiobutton(
            label='100',
            variable=self._result_size,
            underline=0,
            value=100,
            command=self.set_result_size,
        )
        rescntmenu.invoke(1)
        editmenu.add_cascade(label='Result Count', underline=0, menu=rescntmenu)

        menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
        self.top.config(menu=menubar)

    def set_result_size(self, **kwargs):
        self.model.result_count = self._result_size.get()

    def _init_results_box(self, parent):
        innerframe = Frame(parent)
        i1 = Frame(innerframe)
        i2 = Frame(innerframe)
        vscrollbar = Scrollbar(i1, borderwidth=1)
        hscrollbar = Scrollbar(i2, borderwidth=1, orient='horiz')
        self.results_box = Text(
            i1,
            font=Font(family='courier', size='16'),
            state='disabled',
            borderwidth=1,
            yscrollcommand=vscrollbar.set,
            xscrollcommand=hscrollbar.set,
            wrap='none',
            width='40',
            height='20',
            exportselection=1,
        )
        self.results_box.pack(side='left', fill='both', expand=True)
        vscrollbar.pack(side='left', fill='y', anchor='e')
        vscrollbar.config(command=self.results_box.yview)
        hscrollbar.pack(side='left', fill='x', expand=True, anchor='w')
        hscrollbar.config(command=self.results_box.xview)
        # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!!
        Label(i2, text='   ', background=self._BACKGROUND_COLOUR).pack(
            side='left', anchor='e'
        )
        i1.pack(side='top', fill='both', expand=True, anchor='n')
        i2.pack(side='bottom', fill='x', anchor='s')
        innerframe.pack(side='top', fill='both', expand=True)

    def _init_paging(self, parent):
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.prev = prev = Button(
            innerframe,
            text='Previous',
            command=self.previous,
            width='10',
            borderwidth=1,
            highlightthickness=1,
            state='disabled',
        )
        prev.pack(side='left', anchor='center')
        self.next = next = Button(
            innerframe,
            text='Next',
            command=self.__next__,
            width='10',
            borderwidth=1,
            highlightthickness=1,
            state='disabled',
        )
        next.pack(side='right', anchor='center')
        innerframe.pack(side='top', fill='y')
        self.reset_current_page()

    def reset_current_page(self):
        self.current_page = -1

    def _poll(self):
        try:
            event = self.queue.get(block=False)
        except q.Empty:
            pass
        else:
            if event == CORPUS_LOADED_EVENT:
                self.handle_corpus_loaded(event)
            elif event == ERROR_LOADING_CORPUS_EVENT:
                self.handle_error_loading_corpus(event)
        self.after = self.top.after(POLL_INTERVAL, self._poll)

    def handle_error_loading_corpus(self, event):
        self.status['text'] = 'Error in loading ' + self.var.get()
        self.unfreeze_editable()
        self.clear_results_box()
        self.freeze_editable()
        self.reset_current_page()

    def handle_corpus_loaded(self, event):
        self.status['text'] = self.var.get() + ' is loaded'
        self.unfreeze_editable()
        self.clear_results_box()
        self.reset_current_page()
        # self.next()
        collocations = self.model.next(self.current_page + 1)
        self.write_results(collocations)
        self.current_page += 1

    def corpus_selected(self, *args):
        new_selection = self.var.get()
        self.load_corpus(new_selection)

    def previous(self):
        self.freeze_editable()
        collocations = self.model.prev(self.current_page - 1)
        self.current_page = self.current_page - 1
        self.clear_results_box()
        self.write_results(collocations)
        self.unfreeze_editable()

    def __next__(self):
        self.freeze_editable()
        collocations = self.model.next(self.current_page + 1)
        self.clear_results_box()
        self.write_results(collocations)
        self.current_page += 1
        self.unfreeze_editable()

    def load_corpus(self, selection):
        if self.model.selected_corpus != selection:
            self.status['text'] = 'Loading ' + selection + '...'
            self.freeze_editable()
            self.model.load_corpus(selection)

    def freeze_editable(self):
        self.prev['state'] = 'disabled'
        self.next['state'] = 'disabled'

    def clear_results_box(self):
        self.results_box['state'] = 'normal'
        self.results_box.delete("1.0", END)
        self.results_box['state'] = 'disabled'

    def fire_event(self, event):
        # Firing an event so that rendering of widgets happen in the mainloop thread
        self.top.event_generate(event, when='tail')

    def destroy(self, *e):
        if self.top is None:
            return
        self.top.after_cancel(self.after)
        self.top.destroy()
        self.top = None

    def mainloop(self, *args, **kwargs):
        if in_idle():
            return
        self.top.mainloop(*args, **kwargs)

    def unfreeze_editable(self):
        self.set_paging_button_states()

    def set_paging_button_states(self):
        if self.current_page == -1 or self.current_page == 0:
            self.prev['state'] = 'disabled'
        else:
            self.prev['state'] = 'normal'
        if self.model.is_last_page(self.current_page):
            self.next['state'] = 'disabled'
        else:
            self.next['state'] = 'normal'

    def write_results(self, results):
        self.results_box['state'] = 'normal'
        row = 1
        for each in results:
            self.results_box.insert(str(row) + '.0', each[0] + " " + each[1] + "\n")
            row += 1
        self.results_box['state'] = 'disabled'


class CollocationsModel:
    def __init__(self, queue):
        self.result_count = None
        self.selected_corpus = None
        self.collocations = None
        self.CORPORA = _CORPORA
        self.DEFAULT_CORPUS = _DEFAULT
        self.queue = queue
        self.reset_results()

    def reset_results(self):
        self.result_pages = []
        self.results_returned = 0

    def load_corpus(self, name):
        self.selected_corpus = name
        self.collocations = None
        runner_thread = self.LoadCorpus(name, self)
        runner_thread.start()
        self.reset_results()

    def non_default_corpora(self):
        copy = []
        copy.extend(list(self.CORPORA.keys()))
        copy.remove(self.DEFAULT_CORPUS)
        copy.sort()
        return copy

    def is_last_page(self, number):
        if number < len(self.result_pages):
            return False
        return self.results_returned + (
            number - len(self.result_pages)
        ) * self.result_count >= len(self.collocations)

    def next(self, page):
        if (len(self.result_pages) - 1) < page:
            for i in range(page - (len(self.result_pages) - 1)):
                self.result_pages.append(
                    self.collocations[
                        self.results_returned : self.results_returned
                        + self.result_count
                    ]
                )
                self.results_returned += self.result_count
        return self.result_pages[page]

    def prev(self, page):
        if page == -1:
            return []
        return self.result_pages[page]

    class LoadCorpus(threading.Thread):
        def __init__(self, name, model):
            threading.Thread.__init__(self)
            self.model, self.name = model, name

        def run(self):
            try:
                words = self.model.CORPORA[self.name]()
                from operator import itemgetter

                text = [w for w in words if len(w) > 2]
                fd = FreqDist(tuple(text[i : i + 2]) for i in range(len(text) - 1))
                vocab = FreqDist(text)
                scored = [
                    ((w1, w2), fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2]))
                    for w1, w2 in fd
                ]
                scored.sort(key=itemgetter(1), reverse=True)
                self.model.collocations = list(map(itemgetter(0), scored))
                self.model.queue.put(CORPUS_LOADED_EVENT)
            except Exception as e:
                print(e)
                self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)


# def collocations():
#    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations[:num]]


[docs]def app(): c = CollocationsView() c.mainloop()
if __name__ == '__main__': app() __all__ = ['app']