Source code for nltk.sem.boxer

# Natural Language Toolkit: Interface to Boxer
# <http://svn.ask.it.usyd.edu.au/trac/candc/wiki/boxer>
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
# Copyright (C) 2001-2023 NLTK Project
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
An interface to Boxer.

This interface relies on the latest development (Subversion) version of
C&C and Boxer.

Usage
=====

Set the environment variable CANDC to the bin directory of your CandC installation.
The models directory should be in the CandC root directory.
For example::

    /path/to/candc/
        bin/
            candc
            boxer
        models/
            boxer/
"""

import operator
import os
import re
import subprocess
import tempfile
from functools import reduce
from optparse import OptionParser

from nltk.internals import find_binary
from nltk.sem.drt import (
    DRS,
    DrtApplicationExpression,
    DrtEqualityExpression,
    DrtNegatedExpression,
    DrtOrExpression,
    DrtParser,
    DrtProposition,
    DrtTokens,
    DrtVariableExpression,
)
from nltk.sem.logic import (
    ExpectedMoreTokensException,
    LogicalExpressionException,
    UnexpectedTokenException,
    Variable,
)


class Boxer:
    """
    This class is an interface to Johan Bos's program Boxer, a wide-coverage
    semantic parser that produces Discourse Representation Structures (DRSs).
    """

    def __init__(
        self,
        boxer_drs_interpreter=None,
        elimeq=False,
        bin_dir=None,
        verbose=False,
        resolve=True,
    ):
        """
        :param boxer_drs_interpreter: A class that converts from the
            ``AbstractBoxerDrs`` object hierarchy to a different object.  The
            default is ``NltkDrtBoxerDrsInterpreter``, which converts to the NLTK
            DRT hierarchy.
        :param elimeq: When set to true, Boxer removes all equalities from the
            DRSs and discourse referents standing in the equality relation are
            unified, but only if this can be done in a meaning-preserving manner.
        :param bin_dir: Directory handed to the binary lookup as ``path_to_bin``.
        :param verbose: Passed through to the binary lookup.
        :param resolve: When set to true, Boxer will resolve all anaphoric DRSs
            and perform merge-reduction.  Resolution follows Van der Sandt's
            theory of binding and accommodation.
        """
        if boxer_drs_interpreter is None:
            boxer_drs_interpreter = NltkDrtBoxerDrsInterpreter()
        self._boxer_drs_interpreter = boxer_drs_interpreter

        self._resolve = resolve
        self._elimeq = elimeq

        self.set_bin_dir(bin_dir, verbose)

    def set_bin_dir(self, bin_dir, verbose=False):
        """
        Locate the ``candc`` and ``boxer`` binaries and derive the models path.

        The models directory is taken to be ``../models`` relative to the
        directory that contains the ``candc`` binary.

        :param bin_dir: Directory handed to the binary lookup as ``path_to_bin``.
        :param verbose: Passed through to the binary lookup.
        """
        self._candc_bin = self._find_binary("candc", bin_dir, verbose)
        # Use dirname instead of chopping a fixed five characters off the path,
        # so that a binary named "candc.exe" resolves to the same directory.
        self._candc_models_path = os.path.normpath(
            os.path.join(os.path.dirname(self._candc_bin), "../models")
        )
        self._boxer_bin = self._find_binary("boxer", bin_dir, verbose)

    def interpret(self, input, discourse_id=None, question=False, verbose=False):
        """
        Use Boxer to give a first order representation.

        :param input: str Input sentence to parse
        :param discourse_id: str An identifier to be inserted to each
            occurrence-indexed predicate.
        :param question: bool When true, the ``questions`` model directory is
            used for C&C instead of ``boxer``.
        :param verbose: bool Print details of the subprocess calls.
        :return: ``drt.DrtExpression``
        :raise Exception: if no DRS was produced for the input.
        """
        discourse_ids = [discourse_id] if discourse_id is not None else None
        (d,) = self.interpret_multi_sents([[input]], discourse_ids, question, verbose)
        if not d:
            raise Exception(f'Unable to interpret: "{input}"')
        return d

    def interpret_multi(self, input, discourse_id=None, question=False, verbose=False):
        """
        Use Boxer to give a first order representation.

        :param input: list of str Input sentences to parse as a single discourse
        :param discourse_id: str An identifier to be inserted to each
            occurrence-indexed predicate.
        :param question: bool When true, the ``questions`` model directory is
            used for C&C instead of ``boxer``.
        :param verbose: bool Print details of the subprocess calls.
        :return: ``drt.DrtExpression``
        :raise Exception: if no DRS was produced for the input.
        """
        discourse_ids = [discourse_id] if discourse_id is not None else None
        (d,) = self.interpret_multi_sents([input], discourse_ids, question, verbose)
        if not d:
            raise Exception(f'Unable to interpret: "{input}"')
        return d

    def interpret_sents(
        self, inputs, discourse_ids=None, question=False, verbose=False
    ):
        """
        Use Boxer to give a first order representation.

        :param inputs: list of str Input sentences to parse as individual discourses
        :param discourse_ids: list of str Identifiers to be inserted to each
            occurrence-indexed predicate.
        :param question: bool When true, the ``questions`` model directory is
            used for C&C instead of ``boxer``.
        :param verbose: bool Print details of the subprocess calls.
        :return: list of ``drt.DrtExpression``
        """
        return self.interpret_multi_sents(
            [[input] for input in inputs], discourse_ids, question, verbose
        )

    def interpret_multi_sents(
        self, inputs, discourse_ids=None, question=False, verbose=False
    ):
        """
        Use Boxer to give a first order representation.

        :param inputs: list of list of str Input discourses to parse
        :param discourse_ids: list of str Identifiers to be inserted to each
            occurrence-indexed predicate.  Must have one non-None entry per
            discourse when given.
        :param question: bool When true, the ``questions`` model directory is
            used for C&C instead of ``boxer``.
        :param verbose: bool Print details of the subprocess calls.
        :return: list with one entry per discourse; ``None`` for a discourse
            that produced no DRS.
        """
        if discourse_ids is not None:
            assert len(inputs) == len(discourse_ids)
            # all() copes with an empty id list, where an initializer-less
            # reduce() would raise TypeError.
            assert all(disc_id is not None for disc_id in discourse_ids)
            use_disc_id = True
        else:
            discourse_ids = list(map(str, range(len(inputs))))
            use_disc_id = False

        candc_out = self._call_candc(inputs, discourse_ids, question, verbose=verbose)
        boxer_out = self._call_boxer(candc_out, verbose=verbose)

        drs_dict = self._parse_to_drs_dict(boxer_out, use_disc_id)
        return [drs_dict.get(disc_id, None) for disc_id in discourse_ids]

    def _call_candc(self, inputs, discourse_ids, question, verbose=False):
        """
        Call the ``candc`` binary with the given input.

        :param inputs: list of list of str Input discourses to parse
        :param discourse_ids: list of str Identifiers to be inserted to each
            occurrence-indexed predicate.
        :param question: bool Selects the ``questions`` models instead of ``boxer``.
        :return: stdout
        """
        args = [
            "--models",
            os.path.join(
                self._candc_models_path, "questions" if question else "boxer"
            ),
            "--candc-printer",
            "boxer",
        ]
        # Each discourse is announced by a <META> line carrying its id,
        # followed by its sentences, one per line.
        lines = []
        for discourse, disc_id in zip(inputs, discourse_ids):
            lines.append(f"<META>'{disc_id}'")
            lines.extend(discourse)
        return self._call("\n".join(lines), self._candc_bin, args, verbose)

    def _call_boxer(self, candc_out, verbose=False):
        """
        Call the ``boxer`` binary with the given input.

        The C&C output is written to a temporary file that is passed to boxer
        via ``--input``; the file is removed afterwards even if the call fails.

        :param candc_out: bytes output from C&C parser
        :return: stdout
        """
        fd, temp_filename = tempfile.mkstemp(prefix="boxer-", suffix=".in", text=True)
        try:
            # Write back with the same codec used for decoding so the bytes
            # boxer reads do not depend on the locale.
            with os.fdopen(fd, "w", encoding="utf-8") as f:
                f.write(candc_out.decode("utf-8"))

            args = [
                "--box",
                "false",
                "--semantics",
                "drs",
                # '--flat', 'false', # removed from boxer
                "--resolve",
                "true" if self._resolve else "false",
                "--elimeq",
                "true" if self._elimeq else "false",
                "--format",
                "prolog",
                "--instantiate",
                "true",
                "--input",
                temp_filename,
            ]
            return self._call(None, self._boxer_bin, args, verbose)
        finally:
            os.remove(temp_filename)

    def _find_binary(self, name, bin_dir, verbose=False):
        """Look up the binary ``name`` (or ``name.exe``) via ``find_binary``."""
        return find_binary(
            name,
            path_to_bin=bin_dir,
            env_vars=["CANDC"],
            url="http://svn.ask.it.usyd.edu.au/trac/candc/",
            binary_names=[name, name + ".exe"],
            verbose=verbose,
        )

    def _call(self, input_str, binary, args=None, verbose=False):
        """
        Call the binary with the given input.

        :param input_str: A string whose contents are used as stdin, or None
            to leave stdin untouched.
        :param binary: The location of the binary to call
        :param args: A list of command-line arguments.
        :return: stdout (bytes)
        :raise Exception: if the process exits with a non-zero return code.
        """
        args = [] if args is None else list(args)
        if verbose:
            print("Calling:", binary)
            print("Args:", args)
            print("Input:", input_str)
            print("Command:", binary + " " + " ".join(args))

        # Always run the binary directly with an argument list.  Input is fed
        # through a pipe rather than interpolated into an `echo "..." |` shell
        # command, so quotes, `$` and backticks in the text reach the binary
        # verbatim and cannot be interpreted by a shell.
        cmd = [binary] + args
        if input_str is None:
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, stderr = p.communicate()
        else:
            p = subprocess.Popen(
                cmd,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            # Trailing newline kept for parity with the former `echo`.
            stdout, stderr = p.communicate((input_str + "\n").encode("utf-8"))

        if verbose:
            print("Return code:", p.returncode)
            if stdout:
                print("stdout:\n", stdout, "\n")
            if stderr:
                print("stderr:\n", stderr, "\n")
        if p.returncode != 0:
            raise Exception(
                "ERROR CALLING: {} {}\nReturncode: {}\n{}".format(
                    binary, " ".join(args), p.returncode, stderr
                )
            )

        return stdout

    def _parse_to_drs_dict(self, boxer_out, use_disc_id):
        """
        Turn boxer's Prolog output into a dict of interpreted DRSs.

        Every ``id(<discourse_id>,<drs_id>)`` line is expected to be followed
        by a ``sem(<drs_id>,[...],<drs>).`` line; the DRS part of that line is
        parsed and run through the configured interpreter.

        :param boxer_out: bytes stdout of boxer
        :param use_disc_id: bool Whether the discourse id is handed to the parser
        :return: dict mapping discourse id to interpreted DRS
        """
        lines = boxer_out.decode("utf-8").split("\n")
        drs_dict = {}
        i = 0
        while i < len(lines):
            line = lines[i]
            if line.startswith("id("):
                comma_idx = line.index(",")
                discourse_id = line[3:comma_idx]
                # Strip the surrounding quotes of a quoted id.
                if discourse_id[0] == "'" and discourse_id[-1] == "'":
                    discourse_id = discourse_id[1:-1]
                drs_id = line[comma_idx + 1 : line.index(")")]
                i += 1
                line = lines[i]
                assert line.startswith(f"sem({drs_id},")
                if line[-4:] == "').'":
                    line = line[:-4] + ")."
                assert line.endswith(")."), f"can't parse line: {line}"

                # Skip the bracketed list that precedes the DRS by matching
                # square brackets, starting just inside its opening "[".
                search_start = len(f"sem({drs_id},[")
                brace_count = 1
                drs_start = -1
                for j, c in enumerate(line[search_start:]):
                    if c == "[":
                        brace_count += 1
                    if c == "]":
                        brace_count -= 1
                        if brace_count == 0:
                            drs_start = search_start + j + 1
                            if line[drs_start : drs_start + 3] == "','":
                                drs_start = drs_start + 3
                            else:
                                drs_start = drs_start + 1
                            break
                assert drs_start > -1

                # Drop the trailing ")." of the sem/… term.
                drs_input = line[drs_start:-2].strip()
                parsed = self._parse_drs(drs_input, discourse_id, use_disc_id)
                drs_dict[discourse_id] = self._boxer_drs_interpreter.interpret(parsed)
            i += 1
        return drs_dict

    def _parse_drs(self, drs_string, discourse_id, use_disc_id):
        """Parse one DRS string, passing the discourse id only when requested."""
        parser = BoxerOutputDrsParser(discourse_id if use_disc_id else None)
        return parser.parse(drs_string)
class BoxerOutputDrsParser(DrtParser):
    """
    Parser for the Prolog DRS output from Boxer; builds a hierarchy of
    ``AbstractBoxerDrs`` python objects.
    """

    def __init__(self, discourse_id=None):
        """
        This class is used to parse the Prolog DRS output from Boxer into a
        hierarchy of python objects.

        :param discourse_id: identifier stored on every indexed condition
            created by this parser.
        """
        DrtParser.__init__(self)
        self.discourse_id = discourse_id
        self.sentence_id_offset = None
        self.quote_chars = [("'", "'", "\\", False)]

    def parse(self, data, signature=None):
        return DrtParser.parse(self, data, signature)

    def get_all_symbols(self):
        return ["(", ")", ",", "[", "]", ":"]

    def handle(self, tok, context):
        return self.handle_drs(tok)

    def attempt_adjuncts(self, expression, context):
        return expression

    def parse_condition(self, indices):
        """
        Parse a DRS condition

        :return: list of ``DrtExpression``
        """
        tok = self.token()
        accum = self.handle_condition(tok, indices)
        if accum is None:
            raise UnexpectedTokenException(tok)
        return accum

    def handle_drs(self, tok):
        """
        Parse a DRS-level term: ``drs``, ``merge``/``smerge`` or ``alfa``.

        Returns None for any other token.
        """
        if tok == "drs":
            return self.parse_drs()
        elif tok in ["merge", "smerge"]:
            return self._handle_binary_expression(self._make_merge_expression)(None, [])
        elif tok in ["alfa"]:
            return self._handle_alfa(self._make_merge_expression)(None, [])

    def handle_condition(self, tok, indices):
        """
        Handle a DRS condition

        Every handler except ``not`` returns a factory taking
        ``(sent_index, word_indices)``; the factories are invoked once per
        sentence that ``indices`` refers to.

        :param indices: list of int
        :return: list of ``DrtExpression``
        """
        if tok == "not":
            return [self._handle_not()]

        if tok == "or":
            conds = [self._handle_binary_expression(self._make_or_expression)]
        elif tok == "imp":
            conds = [self._handle_binary_expression(self._make_imp_expression)]
        elif tok == "eq":
            conds = [self._handle_eq()]
        elif tok == "prop":
            conds = [self._handle_prop()]
        elif tok == "pred":
            conds = [self._handle_pred()]
        elif tok == "named":
            conds = [self._handle_named()]
        elif tok == "rel":
            conds = [self._handle_rel()]
        elif tok == "timex":
            conds = self._handle_timex()
        elif tok == "card":
            conds = [self._handle_card()]
        elif tok == "whq":
            conds = [self._handle_whq()]
        elif tok == "duplex":
            conds = [self._handle_duplex()]
        else:
            conds = []

        return sum(
            (
                [cond(sent_index, word_indices) for cond in conds]
                for sent_index, word_indices in self._sent_and_word_indices(indices)
            ),
            [],
        )

    def _handle_not(self):
        self.assertToken(self.token(), "(")
        drs = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return BoxerNot(drs)

    def _handle_pred(self):
        # pred(_G3943, dog, n, 0)
        self.assertToken(self.token(), "(")
        variable = self.parse_variable()
        self.assertToken(self.token(), ",")
        name = self.token()
        self.assertToken(self.token(), ",")
        pos = self.token()
        self.assertToken(self.token(), ",")
        sense = int(self.token())
        self.assertToken(self.token(), ")")

        def _handle_pred_f(sent_index, word_indices):
            return BoxerPred(
                self.discourse_id, sent_index, word_indices, variable, name, pos, sense
            )

        return _handle_pred_f

    def _handle_duplex(self):
        # duplex(whq, drs(...), var, drs(...))
        self.assertToken(self.token(), "(")
        # No answer-type list is parsed in the duplex form, so it stays empty.
        ans_types = []
        self.assertToken(self.token(), "whq")
        self.assertToken(self.token(), ",")
        d1 = self.process_next_expression(None)
        self.assertToken(self.token(), ",")
        ref = self.parse_variable()
        self.assertToken(self.token(), ",")
        d2 = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerWhq(
            self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2
        )

    def _handle_named(self):
        # named(x0, john, per, 0)
        self.assertToken(self.token(), "(")
        variable = self.parse_variable()
        self.assertToken(self.token(), ",")
        name = self.token()
        self.assertToken(self.token(), ",")
        ne_type = self.token()
        self.assertToken(self.token(), ",")
        sense = self.token()  # as per boxer rev 2554
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerNamed(
            self.discourse_id, sent_index, word_indices, variable, name, ne_type, sense
        )

    def _handle_rel(self):
        # rel(_G3993, _G3943, agent, 0)
        self.assertToken(self.token(), "(")
        var1 = self.parse_variable()
        self.assertToken(self.token(), ",")
        var2 = self.parse_variable()
        self.assertToken(self.token(), ",")
        rel = self.token()
        self.assertToken(self.token(), ",")
        sense = int(self.token())
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerRel(
            self.discourse_id, sent_index, word_indices, var1, var2, rel, sense
        )

    def _handle_timex(self):
        # timex(_G18322, date([]: (+), []:'XXXX', [1004]:'04', []:'XX'))
        self.assertToken(self.token(), "(")
        arg = self.parse_variable()
        self.assertToken(self.token(), ",")
        new_conds = self._handle_time_expression(arg)
        self.assertToken(self.token(), ")")
        return new_conds

    def _handle_time_expression(self, arg):
        # date([]: (+), []:'XXXX', [1004]:'04', []:'XX')
        tok = self.token()
        self.assertToken(self.token(), "(")
        if tok == "date":
            conds = self._handle_date(arg)
        elif tok == "time":
            conds = self._handle_time(arg)
        else:
            return None
        self.assertToken(self.token(), ")")
        # The already-built conditions are wrapped in factories so they fit
        # the calling convention of handle_condition.  ``cond=cond`` binds each
        # condition at definition time; without it every factory would return
        # the last condition of the list.
        return [
            lambda sent_index, word_indices: BoxerPred(
                self.discourse_id, sent_index, word_indices, arg, tok, "n", 0
            )
        ] + [lambda sent_index, word_indices, cond=cond: cond for cond in conds]

    def _handle_date(self, arg):
        # []: (+), []:'XXXX', [1004]:'04', []:'XX'
        conds = []

        def date_pred(sent_index, word_indices, name):
            return BoxerPred(
                self.discourse_id, sent_index, word_indices, arg, name, "a", 0
            )

        ((sent_index, word_indices),) = self._sent_and_word_indices(
            self._parse_index_list()
        )
        self.assertToken(self.token(), "(")
        pol = self.token()
        self.assertToken(self.token(), ")")
        conds.append(date_pred(sent_index, word_indices, f"date_pol_{pol}"))
        self.assertToken(self.token(), ",")

        ((sent_index, word_indices),) = self._sent_and_word_indices(
            self._parse_index_list()
        )
        year = self.token()
        if year != "XXXX":
            year = year.replace(":", "_")
            conds.append(date_pred(sent_index, word_indices, f"date_year_{year}"))
        self.assertToken(self.token(), ",")

        ((sent_index, word_indices),) = self._sent_and_word_indices(
            self._parse_index_list()
        )
        month = self.token()
        if month != "XX":
            conds.append(date_pred(sent_index, word_indices, f"date_month_{month}"))
        self.assertToken(self.token(), ",")

        ((sent_index, word_indices),) = self._sent_and_word_indices(
            self._parse_index_list()
        )
        day = self.token()
        if day != "XX":
            conds.append(date_pred(sent_index, word_indices, f"date_day_{day}"))

        return conds

    def _handle_time(self, arg):
        # time([1018]:'18', []:'XX', []:'XX')
        # NOTE(review): ``_make_atom`` is not defined in this class; confirm
        # that the base parser provides it, otherwise any concrete time
        # component raises AttributeError here.
        conds = []
        self._parse_index_list()
        hour = self.token()
        if hour != "XX":
            conds.append(self._make_atom("r_hour_2", arg, hour))
        self.assertToken(self.token(), ",")

        self._parse_index_list()
        minute = self.token()
        if minute != "XX":
            conds.append(self._make_atom("r_min_2", arg, minute))
        self.assertToken(self.token(), ",")

        self._parse_index_list()
        sec = self.token()
        if sec != "XX":
            conds.append(self._make_atom("r_sec_2", arg, sec))

        return conds

    def _handle_card(self):
        # card(_G18535, 28, ge)
        self.assertToken(self.token(), "(")
        variable = self.parse_variable()
        self.assertToken(self.token(), ",")
        value = self.token()
        self.assertToken(self.token(), ",")
        card_type = self.token()
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerCard(
            self.discourse_id, sent_index, word_indices, variable, value, card_type
        )

    def _handle_prop(self):
        # prop(_G15949, drs(...))
        self.assertToken(self.token(), "(")
        variable = self.parse_variable()
        self.assertToken(self.token(), ",")
        drs = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerProp(
            self.discourse_id, sent_index, word_indices, variable, drs
        )

    def _parse_index_list(self):
        # [1001,1002]:
        indices = []
        self.assertToken(self.token(), "[")
        while self.token(0) != "]":
            indices.append(self.parse_index())
            if self.token(0) == ",":
                self.token()  # swallow ','
        self.token()  # swallow ']'
        self.assertToken(self.token(), ":")
        return indices

    def parse_drs(self):
        # drs([[1001]:_G3943],
        #     [[1002]:pred(_G3943, dog, n, 0)]
        #    )
        self.assertToken(self.token(), "(")
        self.assertToken(self.token(), "[")
        refs = set()
        while self.token(0) != "]":
            # The index list of a referent is consumed but not used.
            self._parse_index_list()
            refs.add(self.parse_variable())
            if self.token(0) == ",":
                self.token()  # swallow ','
        self.token()  # swallow ']'
        self.assertToken(self.token(), ",")
        self.assertToken(self.token(), "[")
        conds = []
        while self.token(0) != "]":
            indices = self._parse_index_list()
            conds.extend(self.parse_condition(indices))
            if self.token(0) == ",":
                self.token()  # swallow ','
        self.token()  # swallow ']'
        self.assertToken(self.token(), ")")
        return BoxerDrs(list(refs), conds)

    def _handle_binary_expression(self, make_callback):
        self.assertToken(self.token(), "(")
        drs1 = self.process_next_expression(None)
        self.assertToken(self.token(), ",")
        drs2 = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: make_callback(
            sent_index, word_indices, drs1, drs2
        )

    def _handle_alfa(self, make_callback):
        self.assertToken(self.token(), "(")
        self.token()  # the alfa type is consumed but not used
        self.assertToken(self.token(), ",")
        drs1 = self.process_next_expression(None)
        self.assertToken(self.token(), ",")
        drs2 = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: make_callback(
            sent_index, word_indices, drs1, drs2
        )

    def _handle_eq(self):
        self.assertToken(self.token(), "(")
        var1 = self.parse_variable()
        self.assertToken(self.token(), ",")
        var2 = self.parse_variable()
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerEq(
            self.discourse_id, sent_index, word_indices, var1, var2
        )

    def _handle_whq(self):
        self.assertToken(self.token(), "(")
        self.assertToken(self.token(), "[")
        ans_types = []
        while self.token(0) != "]":
            cat = self.token()
            self.assertToken(self.token(), ":")
            if cat == "des":
                ans_types.append(self.token())
            elif cat == "num":
                ans_types.append("number")
                typ = self.token()
                if typ == "cou":
                    ans_types.append("count")
                else:
                    ans_types.append(typ)
            else:
                ans_types.append(self.token())
        self.token()  # swallow the ']'

        self.assertToken(self.token(), ",")
        d1 = self.process_next_expression(None)
        self.assertToken(self.token(), ",")
        ref = self.parse_variable()
        self.assertToken(self.token(), ",")
        d2 = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerWhq(
            self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2
        )

    def _make_merge_expression(self, sent_index, word_indices, drs1, drs2):
        return BoxerDrs(drs1.refs + drs2.refs, drs1.conds + drs2.conds)

    def _make_or_expression(self, sent_index, word_indices, drs1, drs2):
        return BoxerOr(self.discourse_id, sent_index, word_indices, drs1, drs2)

    def _make_imp_expression(self, sent_index, word_indices, drs1, drs2):
        return BoxerDrs(drs1.refs, drs1.conds, drs2)

    def parse_variable(self):
        var = self.token()
        assert re.match(r"^[exps]\d+$", var), var
        return var

    def parse_index(self):
        return int(self.token())

    def _sent_and_word_indices(self, indices):
        """
        Split raw token indices into per-sentence word indices.

        A raw index encodes ``sentence * 1000 + word`` with both parts
        1-based; the result is 0-based.  Integer (floor) division is required
        here: true division yields a distinct float per index, which puts
        every word into its own bogus "sentence".

        :param indices: list of int
        :return: list of (sent_index, word_indices) tuples, ordered by
            sentence; ``[(None, word_indices)]`` when there is no
            non-negative index.
        """
        sent_indices = sorted({i // 1000 - 1 for i in indices if i >= 0})
        if not sent_indices:
            return [(None, [(i % 1000) - 1 for i in indices])]
        return [
            (
                sent_index,
                [(i % 1000) - 1 for i in indices if i // 1000 - 1 == sent_index],
            )
            for sent_index in sent_indices
        ]
class BoxerDrsParser(DrtParser):
    """
    Reparse the str form of subclasses of ``AbstractBoxerDrs``
    """

    def __init__(self, discourse_id=None):
        """
        :param discourse_id: when not None, used as the discourse id of every
            parsed condition instead of reading one from the input.
        """
        DrtParser.__init__(self)
        self.discourse_id = discourse_id

    def get_all_symbols(self):
        return [
            DrtTokens.OPEN,
            DrtTokens.CLOSE,
            DrtTokens.COMMA,
            DrtTokens.OPEN_BRACKET,
            DrtTokens.CLOSE_BRACKET,
        ]

    def attempt_adjuncts(self, expression, context):
        return expression

    def _parse_index_prefix(self):
        """
        Parse the ``(disc_id, sent_id, [word_ids],`` prefix shared by all
        indexed conditions, including the trailing comma.

        The sentence id may be the literal ``None`` and the word ids are
        materialised as a list, so that the resulting objects can be printed
        and compared more than once.

        :return: (disc_id, sent_id, word_ids)
        """
        self.assertNextToken(DrtTokens.OPEN)
        # A discourse id token is only consumed when none was configured.
        disc_id = self.discourse_id if self.discourse_id is not None else self.token()
        self.assertNextToken(DrtTokens.COMMA)
        sent_id = self.nullableIntToken()
        self.assertNextToken(DrtTokens.COMMA)
        word_ids = list(map(int, self.handle_refs()))
        self.assertNextToken(DrtTokens.COMMA)
        return disc_id, sent_id, word_ids

    def handle(self, tok, context):
        """
        Parse the condition introduced by ``tok``.

        Note that a ``drs(...)`` term is not handled here.

        :raise LogicalExpressionException: wrapping any error raised while
            parsing a recognised condition.
        :raise AssertionError: for an unrecognised token.
        """
        try:
            if tok == "pred":
                disc_id, sent_id, word_ids = self._parse_index_prefix()
                variable = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                name = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                pos = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                sense = int(self.token())
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerPred(disc_id, sent_id, word_ids, variable, name, pos, sense)
            elif tok == "named":
                disc_id, sent_id, word_ids = self._parse_index_prefix()
                variable = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                name = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                ne_type = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                sense = int(self.token())
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerNamed(
                    disc_id, sent_id, word_ids, variable, name, ne_type, sense
                )
            elif tok == "rel":
                disc_id, sent_id, word_ids = self._parse_index_prefix()
                var1 = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                var2 = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                rel = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                sense = int(self.token())
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerRel(disc_id, sent_id, word_ids, var1, var2, rel, sense)
            elif tok == "prop":
                disc_id, sent_id, word_ids = self._parse_index_prefix()
                variable = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                drs = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerProp(disc_id, sent_id, word_ids, variable, drs)
            elif tok == "not":
                self.assertNextToken(DrtTokens.OPEN)
                drs = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerNot(drs)
            elif tok == "imp":
                self.assertNextToken(DrtTokens.OPEN)
                drs1 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.COMMA)
                drs2 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerDrs(drs1.refs, drs1.conds, drs2)
            elif tok == "or":
                disc_id, sent_id, word_ids = self._parse_index_prefix()
                drs1 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.COMMA)
                drs2 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerOr(disc_id, sent_id, word_ids, drs1, drs2)
            elif tok == "eq":
                disc_id, sent_id, word_ids = self._parse_index_prefix()
                var1 = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                var2 = int(self.token())
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerEq(disc_id, sent_id, word_ids, var1, var2)
            elif tok == "card":
                disc_id, sent_id, word_ids = self._parse_index_prefix()
                var = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                value = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                card_type = self.token()
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerCard(disc_id, sent_id, word_ids, var, value, card_type)
            elif tok == "whq":
                disc_id, sent_id, word_ids = self._parse_index_prefix()
                ans_types = self.handle_refs()
                self.assertNextToken(DrtTokens.COMMA)
                drs1 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.COMMA)
                var = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                drs2 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerWhq(disc_id, sent_id, word_ids, ans_types, drs1, var, drs2)
        except Exception as e:
            raise LogicalExpressionException(self._currentIndex, str(e)) from e
        assert False, repr(tok)

    def nullableIntToken(self):
        """Consume one token; return None for the literal ``None``, else its int value."""
        t = self.token()
        return int(t) if t != "None" else None

    def get_next_token_variable(self, description):
        try:
            return self.token()
        except ExpectedMoreTokensException as e:
            raise ExpectedMoreTokensException(e.index, "Variable expected.") from e
class AbstractBoxerDrs:
    """
    Base class of the Boxer DRS object hierarchy.

    Provides neutral defaults (no variables, no atoms, identity transforms)
    that subclasses override as needed.
    """

    def variables(self):
        """
        :return: (set<variables>, set<events>, set<propositions>), made
            disjoint: events win over propositions, and both win over plain
            variables.
        """
        all_vars, event_vars, prop_vars = self._variables()
        plain_vars = all_vars - (event_vars | prop_vars)
        pure_props = prop_vars - event_vars
        return (plain_vars, event_vars, pure_props)

    def variable_types(self):
        """
        :return: dict mapping each variable to ``"z"`` (plain), ``"e"``
            (event) or ``"p"`` (proposition).
        """
        type_codes = ("z", "e", "p")
        return {
            var: code
            for code, group in zip(type_codes, self.variables())
            for var in group
        }

    def _variables(self):
        """
        :return: (set<variables>, set<events>, set<propositions>)
        """
        return (set(), set(), set())

    def atoms(self):
        """Return the set of atoms; empty by default."""
        return set()

    def clean(self):
        """Return a cleaned version of this object; the object itself by default."""
        return self

    def _clean_name(self, name):
        """Replace ``-`` and ``'`` in ``name`` with underscores."""
        without_dashes = name.replace("-", "_")
        return without_dashes.replace("'", "_")

    def renumber_sentences(self, f):
        """Return a version with sentence indices mapped through ``f``; the object itself by default."""
        return self

    def __hash__(self):
        # Hash the string form of the object.
        return hash(f"{self}")
class BoxerDrs(AbstractBoxerDrs):
    """
    A DRS: discourse referents plus conditions, with an optional consequent
    DRS that turns it into an implication.
    """

    def __init__(self, refs, conds, consequent=None):
        """
        :param refs: list of discourse referents
        :param conds: list of conditions (``AbstractBoxerDrs`` objects)
        :param consequent: optional consequent DRS; when set, the object
            prints as ``imp(drs(...), consequent)``.
        """
        AbstractBoxerDrs.__init__(self)
        self.refs = refs
        self.conds = conds
        self.consequent = consequent

    def _variables(self):
        """Union the variable triples of all conditions and of the consequent."""
        variables = (set(), set(), set())
        for cond in self.conds:
            for s, v in zip(variables, cond._variables()):
                s.update(v)
        if self.consequent is not None:
            for s, v in zip(variables, self.consequent._variables()):
                s.update(v)
        return variables

    def atoms(self):
        """Return the union of the atoms of all conditions and of the consequent."""
        atoms = set()
        for cond in self.conds:
            atoms.update(cond.atoms())
        if self.consequent is not None:
            atoms.update(self.consequent.atoms())
        return atoms

    def clean(self):
        """Return a copy in which every condition and the consequent are cleaned."""
        consequent = self.consequent.clean() if self.consequent else None
        return BoxerDrs(self.refs, [c.clean() for c in self.conds], consequent)

    def renumber_sentences(self, f):
        """Return a copy with ``f`` applied to the sentence indices of all parts."""
        consequent = self.consequent.renumber_sentences(f) if self.consequent else None
        return BoxerDrs(
            self.refs, [c.renumber_sentences(f) for c in self.conds], consequent
        )

    def __repr__(self):
        s = "drs([{}], [{}])".format(
            ", ".join("%s" % r for r in self.refs),
            ", ".join("%s" % c for c in self.conds),
        )
        if self.consequent is not None:
            s = f"imp({s}, {self.consequent})"
        return s

    def __eq__(self, other):
        # all() handles DRSs without conditions, where an initializer-less
        # reduce() raises TypeError, and stops at the first mismatch.
        return (
            self.__class__ == other.__class__
            and self.refs == other.refs
            and len(self.conds) == len(other.conds)
            and all(c1 == c2 for c1, c2 in zip(self.conds, other.conds))
            and self.consequent == other.consequent
        )

    def __ne__(self, other):
        return not self == other

    # Defining __eq__ resets __hash__, so restore the inherited one.
    __hash__ = AbstractBoxerDrs.__hash__
class BoxerNot(AbstractBoxerDrs):
    """Negation of a DRS; most operations delegate to the wrapped DRS."""

    def __init__(self, drs):
        """
        :param drs: the negated DRS
        """
        AbstractBoxerDrs.__init__(self)
        self.drs = drs

    def _variables(self):
        inner = self.drs
        return inner._variables()

    def atoms(self):
        inner = self.drs
        return inner.atoms()

    def clean(self):
        cleaned = self.drs.clean()
        return BoxerNot(cleaned)

    def renumber_sentences(self, f):
        renumbered = self.drs.renumber_sentences(f)
        return BoxerNot(renumbered)

    def __repr__(self):
        return "not(%s)" % (self.drs)

    def __eq__(self, other):
        if self.__class__ != other.__class__:
            return False
        return self.drs == other.drs

    def __ne__(self, other):
        return not self == other

    # Defining __eq__ resets __hash__, so restore the inherited one.
    __hash__ = AbstractBoxerDrs.__hash__
class BoxerIndexed(AbstractBoxerDrs):
    """
    Base class for conditions that carry a discourse id, a sentence index and
    word indices.

    Subclasses are expected to be iterable over their remaining fields and to
    provide ``_pred()``, the functor name used in the string form.
    """

    def __init__(self, discourse_id, sent_index, word_indices):
        AbstractBoxerDrs.__init__(self)
        self.discourse_id = discourse_id
        self.sent_index = sent_index
        self.word_indices = word_indices

    def atoms(self):
        """An indexed condition is its own single atom."""
        return {self}

    def __eq__(self, other):
        # all() stops at the first differing field and, unlike an
        # initializer-less reduce(), copes with an empty field sequence.
        return (
            self.__class__ == other.__class__
            and self.discourse_id == other.discourse_id
            and self.sent_index == other.sent_index
            and self.word_indices == other.word_indices
            and all(s == o for s, o in zip(self, other))
        )

    def __ne__(self, other):
        return not self == other

    # Defining __eq__ resets __hash__, so restore the inherited one.
    __hash__ = AbstractBoxerDrs.__hash__

    def __repr__(self):
        head = "{}({}, {}, [{}]".format(
            self._pred(),
            self.discourse_id,
            self.sent_index,
            ", ".join("%s" % wi for wi in self.word_indices),
        )
        # Subclass-specific fields follow the shared index prefix.
        tail = "".join(", %s" % v for v in self)
        return head + tail + ")"
class BoxerPred(BoxerIndexed):
    """Indexed predicate condition: ``pred(..., var, name, pos, sense)``."""

    def __init__(self, discourse_id, sent_index, word_indices, var, name, pos, sense):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var = var
        self.name = name
        self.pos = pos
        self.sense = sense

    def _variables(self):
        return ({self.var}, set(), set())

    def _rebuild(self, **overrides):
        """Build a new ``BoxerPred`` from this one's fields, with ``overrides`` applied."""
        fields = {
            "discourse_id": self.discourse_id,
            "sent_index": self.sent_index,
            "word_indices": self.word_indices,
            "var": self.var,
            "name": self.name,
            "pos": self.pos,
            "sense": self.sense,
        }
        fields.update(overrides)
        return BoxerPred(**fields)

    def change_var(self, var):
        """Return a copy that uses ``var`` as its variable."""
        return self._rebuild(var=var)

    def clean(self):
        """Return a copy whose name has been passed through ``_clean_name``."""
        return self._rebuild(name=self._clean_name(self.name))

    def renumber_sentences(self, f):
        """Return a copy whose sentence index is ``f(self.sent_index)``."""
        return self._rebuild(sent_index=f(self.sent_index))

    def __iter__(self):
        yield from (self.var, self.name, self.pos, self.sense)

    def _pred(self):
        return "pred"
class BoxerNamed(BoxerIndexed):
    """Indexed named-entity condition: ``named(..., var, name, type, sense)``."""

    def __init__(self, discourse_id, sent_index, word_indices, var, name, type, sense):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var = var
        self.name = name
        self.type = type
        self.sense = sense

    def _variables(self):
        return ({self.var}, set(), set())

    def _rebuild(self, **overrides):
        """Build a new ``BoxerNamed`` from this one's fields, with ``overrides`` applied."""
        fields = {
            "discourse_id": self.discourse_id,
            "sent_index": self.sent_index,
            "word_indices": self.word_indices,
            "var": self.var,
            "name": self.name,
            "type": self.type,
            "sense": self.sense,
        }
        fields.update(overrides)
        return BoxerNamed(**fields)

    def change_var(self, var):
        """Return a copy that uses ``var`` as its variable."""
        return self._rebuild(var=var)

    def clean(self):
        """Return a copy whose name has been passed through ``_clean_name``."""
        return self._rebuild(name=self._clean_name(self.name))

    def renumber_sentences(self, f):
        """Return a copy whose sentence index is ``f(self.sent_index)``."""
        return self._rebuild(sent_index=f(self.sent_index))

    def __iter__(self):
        yield from (self.var, self.name, self.type, self.sense)

    def _pred(self):
        return "named"
class BoxerRel(BoxerIndexed):
    """
    Indexed Boxer condition rendered as ``rel(...)``, relating two variables
    through a relation name and carrying a sense.
    """

    def __init__(self, discourse_id, sent_index, word_indices, var1, var2, rel, sense):
        """
        :param discourse_id: discourse identifier (stored by ``BoxerIndexed``)
        :param sent_index: sentence index (stored by ``BoxerIndexed``)
        :param word_indices: word indices (stored by ``BoxerIndexed``)
        :param var1: the first variable of the relation
        :param var2: the second variable of the relation
        :param rel: the relation name
        :param sense: the sense value
        """
        super().__init__(discourse_id, sent_index, word_indices)
        self.var1 = var1
        self.var2 = var2
        self.rel = rel
        self.sense = sense

    def _variables(self):
        # Three sets are returned; both variables go into the first one.
        first = {self.var1, self.var2}
        return (first, set(), set())

    def clean(self):
        """
        :return: a new ``BoxerRel`` whose relation name has been passed
            through ``self._clean_name``
        """
        cleaned_rel = self._clean_name(self.rel)
        return BoxerRel(
            self.discourse_id,
            self.sent_index,
            self.word_indices,
            self.var1,
            self.var2,
            cleaned_rel,
            self.sense,
        )

    def renumber_sentences(self, f):
        """
        :param f: callable mapping the current sentence index to a new one
        :return: a new ``BoxerRel`` carrying ``f(self.sent_index)``
        """
        renumbered = f(self.sent_index)
        return BoxerRel(
            self.discourse_id,
            renumbered,
            self.word_indices,
            self.var1,
            self.var2,
            self.rel,
            self.sense,
        )

    def __iter__(self):
        fields = (self.var1, self.var2, self.rel, self.sense)
        return iter(fields)

    def _pred(self):
        return "rel"
class BoxerProp(BoxerIndexed):
    """
    Indexed Boxer condition rendered as ``prop(...)``, pairing a variable
    with a nested DRS.
    """

    def __init__(self, discourse_id, sent_index, word_indices, var, drs):
        """
        :param discourse_id: discourse identifier (stored by ``BoxerIndexed``)
        :param sent_index: sentence index (stored by ``BoxerIndexed``)
        :param word_indices: word indices (stored by ``BoxerIndexed``)
        :param var: the variable labelling the nested DRS
        :param drs: the nested DRS
        """
        super().__init__(discourse_id, sent_index, word_indices)
        self.var = var
        self.drs = drs

    def _variables(self):
        # This condition contributes ``var`` to the third set only; each of
        # the three sets is then unioned with the matching set of the nested
        # DRS.
        own_sets = (set(), set(), {self.var})
        nested_sets = self.drs._variables()
        return tuple(own | nested for own, nested in zip(own_sets, nested_sets))

    def referenced_labels(self):
        """
        :return: a one-element set containing the nested DRS
        """
        return {self.drs}

    def atoms(self):
        """
        :return: the atoms of the nested DRS
        """
        return self.drs.atoms()

    def clean(self):
        """
        :return: a new ``BoxerProp`` wrapping the cleaned nested DRS
        """
        cleaned_drs = self.drs.clean()
        return BoxerProp(
            self.discourse_id,
            self.sent_index,
            self.word_indices,
            self.var,
            cleaned_drs,
        )

    def renumber_sentences(self, f):
        """
        :param f: callable mapping the current sentence index to a new one
        :return: a new ``BoxerProp`` with ``f`` applied to its own sentence
            index and propagated into the nested DRS
        """
        renumbered_index = f(self.sent_index)
        renumbered_drs = self.drs.renumber_sentences(f)
        return BoxerProp(
            self.discourse_id,
            renumbered_index,
            self.word_indices,
            self.var,
            renumbered_drs,
        )

    def __iter__(self):
        fields = (self.var, self.drs)
        return iter(fields)

    def _pred(self):
        return "prop"
class BoxerEq(BoxerIndexed):
    """
    Indexed Boxer condition rendered as ``eq(...)``, holding two variables.
    """

    def __init__(self, discourse_id, sent_index, word_indices, var1, var2):
        """
        :param discourse_id: discourse identifier (stored by ``BoxerIndexed``)
        :param sent_index: sentence index (stored by ``BoxerIndexed``)
        :param word_indices: word indices (stored by ``BoxerIndexed``)
        :param var1: the first variable
        :param var2: the second variable
        """
        super().__init__(discourse_id, sent_index, word_indices)
        self.var1 = var1
        self.var2 = var2

    def _variables(self):
        # Three sets are returned; both variables go into the first one.
        first = {self.var1, self.var2}
        return (first, set(), set())

    def atoms(self):
        """
        :return: an empty set (this condition reports no atoms)
        """
        return set()

    def renumber_sentences(self, f):
        """
        :param f: callable mapping the current sentence index to a new one
        :return: a new ``BoxerEq`` carrying ``f(self.sent_index)``
        """
        renumbered = f(self.sent_index)
        return BoxerEq(
            self.discourse_id,
            renumbered,
            self.word_indices,
            self.var1,
            self.var2,
        )

    def __iter__(self):
        fields = (self.var1, self.var2)
        return iter(fields)

    def _pred(self):
        return "eq"
class BoxerCard(BoxerIndexed):
    """
    Indexed Boxer condition rendered as ``card(...)``, holding a variable, a
    value and a type.
    """

    def __init__(self, discourse_id, sent_index, word_indices, var, value, type):
        """
        :param discourse_id: discourse identifier (stored by ``BoxerIndexed``)
        :param sent_index: sentence index (stored by ``BoxerIndexed``)
        :param word_indices: word indices (stored by ``BoxerIndexed``)
        :param var: the variable the condition applies to
        :param value: the value
        :param type: the type value
        """
        super().__init__(discourse_id, sent_index, word_indices)
        self.var = var
        self.value = value
        self.type = type

    def _variables(self):
        # Three sets are returned; only the first one is populated here.
        first = {self.var}
        return (first, set(), set())

    def renumber_sentences(self, f):
        """
        :param f: callable mapping the current sentence index to a new one
        :return: a new ``BoxerCard`` carrying ``f(self.sent_index)``
        """
        renumbered = f(self.sent_index)
        return BoxerCard(
            self.discourse_id,
            renumbered,
            self.word_indices,
            self.var,
            self.value,
            self.type,
        )

    def __iter__(self):
        fields = (self.var, self.value, self.type)
        return iter(fields)

    def _pred(self):
        return "card"
class BoxerOr(BoxerIndexed):
    """
    Indexed Boxer condition rendered as ``or(...)``, holding two nested DRSs.
    """

    def __init__(self, discourse_id, sent_index, word_indices, drs1, drs2):
        """
        :param discourse_id: discourse identifier (stored by ``BoxerIndexed``)
        :param sent_index: sentence index (stored by ``BoxerIndexed``)
        :param word_indices: word indices (stored by ``BoxerIndexed``)
        :param drs1: the first nested DRS
        :param drs2: the second nested DRS
        """
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.drs1 = drs1
        self.drs2 = drs2

    def _variables(self):
        # Position-wise union of the variable sets of both nested DRSs.
        return tuple(
            left | right
            for left, right in zip(self.drs1._variables(), self.drs2._variables())
        )

    def atoms(self):
        """
        :return: the union of the atoms of both nested DRSs
        """
        return self.drs1.atoms() | self.drs2.atoms()

    def clean(self):
        """
        :return: a new ``BoxerOr`` wrapping the cleaned nested DRSs
        """
        return BoxerOr(
            self.discourse_id,
            self.sent_index,
            self.word_indices,
            self.drs1.clean(),
            self.drs2.clean(),
        )

    def renumber_sentences(self, f):
        """
        :param f: callable mapping the current sentence index to a new one
        :return: a new ``BoxerOr`` with ``f`` applied to its own sentence
            index and propagated into both nested DRSs
        """
        # The nested DRSs are renumbered as well, in the same way that
        # ``BoxerProp.renumber_sentences`` recurses into its nested DRS;
        # otherwise the conditions inside them would keep stale indices.
        return BoxerOr(
            self.discourse_id,
            f(self.sent_index),
            self.word_indices,
            self.drs1.renumber_sentences(f),
            self.drs2.renumber_sentences(f),
        )

    def __iter__(self):
        return iter((self.drs1, self.drs2))

    def _pred(self):
        return "or"
class BoxerWhq(BoxerIndexed):
    """
    Indexed Boxer condition rendered as ``whq(...)``, holding a list of
    answer types, two nested DRSs and a variable.
    """

    def __init__(
        self, discourse_id, sent_index, word_indices, ans_types, drs1, variable, drs2
    ):
        """
        :param discourse_id: discourse identifier (stored by ``BoxerIndexed``)
        :param sent_index: sentence index (stored by ``BoxerIndexed``)
        :param word_indices: word indices (stored by ``BoxerIndexed``)
        :param ans_types: iterable of answer-type strings (joined with ``,``
            when the condition is iterated or printed)
        :param drs1: the first nested DRS
        :param variable: the variable of the question
        :param drs2: the second nested DRS
        """
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.ans_types = ans_types
        self.drs1 = drs1
        self.variable = variable
        self.drs2 = drs2

    def _variables(self):
        # Position-wise union of this condition's own contribution (the
        # question variable, in the first set) with the sets of both nested
        # DRSs.  ``operator.or_`` takes exactly two operands, so it cannot be
        # mapped over three iterables at once; the union is spelled out.
        own_sets = ({self.variable}, set(), set())
        return tuple(
            own | first | second
            for own, first, second in zip(
                own_sets, self.drs1._variables(), self.drs2._variables()
            )
        )

    def atoms(self):
        """
        :return: the union of the atoms of both nested DRSs
        """
        return self.drs1.atoms() | self.drs2.atoms()

    def clean(self):
        """
        :return: a new ``BoxerWhq`` wrapping the cleaned nested DRSs
        """
        return BoxerWhq(
            self.discourse_id,
            self.sent_index,
            self.word_indices,
            self.ans_types,
            self.drs1.clean(),
            self.variable,
            self.drs2.clean(),
        )

    def renumber_sentences(self, f):
        """
        :param f: callable mapping the current sentence index to a new one
        :return: a new ``BoxerWhq`` with ``f`` applied to its own sentence
            index and propagated into both nested DRSs
        """
        # The nested DRSs are renumbered as well, in the same way that
        # ``BoxerProp.renumber_sentences`` recurses into its nested DRS;
        # otherwise the conditions inside them would keep stale indices.
        return BoxerWhq(
            self.discourse_id,
            f(self.sent_index),
            self.word_indices,
            self.ans_types,
            self.drs1.renumber_sentences(f),
            self.variable,
            self.drs2.renumber_sentences(f),
        )

    def __iter__(self):
        return iter(
            ("[" + ",".join(self.ans_types) + "]", self.drs1, self.variable, self.drs2)
        )

    def _pred(self):
        return "whq"
class PassthroughBoxerDrsInterpreter:
    """
    Interpreter that performs no conversion: whatever expression it is given
    is handed back unchanged.
    """

    def interpret(self, ex):
        """
        :param ex: the expression to interpret
        :return: ``ex`` itself, unmodified
        """
        result = ex
        return result
class NltkDrtBoxerDrsInterpreter:
    """
    Interpreter that converts the Boxer DRS object hierarchy into NLTK DRT
    expressions.
    """

    def __init__(self, occur_index=False):
        """
        :param occur_index: when true, predicate names built by ``interpret``
            get a suffix derived from the discourse id, sentence index and
            lowest word index of the condition they come from
        """
        self._occur_index = occur_index

    def interpret(self, ex):
        """
        :param ex: ``AbstractBoxerDrs``
        :return: ``DrtExpression``
        :raise AssertionError: if ``ex`` is not one of the supported node types
        """
        if isinstance(ex, BoxerDrs):
            drs = DRS(
                [Variable(r) for r in ex.refs],
                [self.interpret(cond) for cond in ex.conds],
            )
            if ex.consequent is not None:
                drs.consequent = self.interpret(ex.consequent)
            return drs
        if isinstance(ex, BoxerNot):
            return DrtNegatedExpression(self.interpret(ex.drs))
        if isinstance(ex, BoxerPred):
            pred = self._add_occur_indexing(f"{ex.pos}_{ex.name}", ex)
            return self._make_atom(pred, ex.var)
        if isinstance(ex, BoxerNamed):
            pred = self._add_occur_indexing(f"ne_{ex.type}_{ex.name}", ex)
            return self._make_atom(pred, ex.var)
        if isinstance(ex, BoxerRel):
            pred = self._add_occur_indexing("%s" % (ex.rel), ex)
            return self._make_atom(pred, ex.var1, ex.var2)
        if isinstance(ex, BoxerProp):
            return DrtProposition(Variable(ex.var), self.interpret(ex.drs))
        if isinstance(ex, BoxerEq):
            return DrtEqualityExpression(
                DrtVariableExpression(Variable(ex.var1)),
                DrtVariableExpression(Variable(ex.var2)),
            )
        if isinstance(ex, BoxerCard):
            pred = self._add_occur_indexing(f"card_{ex.type}_{ex.value}", ex)
            return self._make_atom(pred, ex.var)
        if isinstance(ex, BoxerOr):
            return DrtOrExpression(self.interpret(ex.drs1), self.interpret(ex.drs2))
        if isinstance(ex, BoxerWhq):
            # Both sub-DRSs are interpreted and flattened into a single DRS.
            drs1 = self.interpret(ex.drs1)
            drs2 = self.interpret(ex.drs2)
            return DRS(drs1.refs + drs2.refs, drs1.conds + drs2.conds)
        # Raised explicitly rather than via ``assert False`` so that the check
        # survives ``python -O`` instead of silently returning None.
        raise AssertionError(f"{ex.__class__.__name__}: {ex}")

    def _make_atom(self, pred, *args):
        # Build the curried application pred(arg1)(arg2)... one argument at a
        # time, wrapping every name in a DRT variable expression.
        accum = DrtVariableExpression(Variable(pred))
        for arg in args:
            accum = DrtApplicationExpression(
                accum, DrtVariableExpression(Variable(arg))
            )
        return accum

    def _add_occur_indexing(self, base, ex):
        # Suffix is only added when occurrence indexing is enabled and the
        # condition actually carries a sentence index.
        if self._occur_index and ex.sent_index is not None:
            if ex.discourse_id:
                base += "_%s" % ex.discourse_id
            base += "_s%s" % ex.sent_index
            # The lowest word index identifies the occurrence.
            base += "_w%s" % min(ex.word_indices)
        return base
class UnparseableInputException(Exception):
    """
    Exception type declared by this module; it adds nothing to ``Exception``.

    NOTE(review): presumably raised when input cannot be parsed -- confirm
    against the raise sites elsewhere in the file.
    """
if __name__ == "__main__":
    # Command-line entry point: parse a single TEXT argument with Boxer and
    # print the resulting DRS (or its first-order-logic form).
    opts = OptionParser("usage: %prog TEXT [options]")

    # Every option is a boolean switch that defaults to False.
    for long_flag, short_flag, help_text, dest_name in (
        ("--verbose", "-v", "display verbose logs", "verbose"),
        ("--fol", "-f", "output FOL", "fol"),
        ("--question", "-q", "input is a question", "question"),
        ("--occur", "-o", "occurrence index", "occur_index"),
    ):
        opts.add_option(
            long_flag,
            short_flag,
            help=help_text,
            action="store_true",
            default=False,
            dest=dest_name,
        )

    options, args = opts.parse_args()
    if len(args) != 1:
        opts.error("incorrect number of arguments")

    interpreter = NltkDrtBoxerDrsInterpreter(occur_index=options.occur_index)
    # The text is split on the literal two-character sequence backslash + "n"
    # (a raw string), not on real newline characters.
    sentences = args[0].split(r"\n")
    drs = Boxer(interpreter).interpret_multi(
        sentences, question=options.question, verbose=options.verbose
    )

    if drs is None:
        print(None)
    else:
        drs = drs.simplify().eliminate_equality()
        if not options.fol:
            drs.pretty_print()
        else:
            print(drs.fol().normalize())