# Source code for nltk.sem.glue

# Natural Language Toolkit: Glue Semantics
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
# Copyright (C) 2001-2021 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

import os
from itertools import chain

import nltk
from nltk.internals import Counter
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, RegexpTagger
from nltk.sem.logic import (
    Expression,
    Variable,
    VariableExpression,
    LambdaExpression,
    AbstractVariableExpression,
)
from nltk.sem import drt
from nltk.sem import linearlogic

# Maps a lower-cased specifier word to the name of its semtype entry.
# GlueDict.get_semtypes() consults this table for nodes whose relation is
# "spec"; words that are not listed fall back to the "default" entry.
SPEC_SEMTYPES = {
    "a": "ex_quant",
    "an": "ex_quant",
    "every": "univ_quant",
    "the": "def_art",
    "no": "no_quant",
    "default": "ex_quant",
}

# Dependency relations that GlueDict._lookup_semtype_option() leaves out when
# it builds the relationship set used to select a glue entry for a node.
OPTIONAL_RELATIONSHIPS = ["nmod", "vmod", "punct"]


class GlueFormula(object):
    """
    A meaning term paired with a linear-logic glue term.

    Instances carry three attributes: ``meaning`` (a logic ``Expression``),
    ``glue`` (a ``linearlogic.Expression``) and ``indices`` (a ``set``).
    """

    def __init__(self, meaning, glue, indices=None):
        """
        :param meaning: a meaning term, either an ``Expression`` or a string
            that is parsed with ``Expression.fromstring``
        :param glue: a glue term, either a ``linearlogic.Expression`` or a
            string that is parsed with ``linearlogic.LinearLogicParser``
        :param indices: the set of indices of this formula; any falsy value
            (including an empty set) is replaced by a fresh empty set
        :raise RuntimeError: if ``meaning`` or ``glue`` has any other type
        """
        if not isinstance(meaning, (str, Expression)):
            raise RuntimeError(
                "Meaning term neither string or expression: %s, %s"
                % (meaning, meaning.__class__)
            )
        self.meaning = (
            Expression.fromstring(meaning) if isinstance(meaning, str) else meaning
        )

        if not isinstance(glue, (str, linearlogic.Expression)):
            raise RuntimeError(
                "Glue term neither string or expression: %s, %s"
                % (glue, glue.__class__)
            )
        self.glue = (
            linearlogic.LinearLogicParser().parse(glue)
            if isinstance(glue, str)
            else glue
        )

        self.indices = indices if indices else set()

    def applyto(self, arg):
        """
        Apply this formula to ``arg`` and return the resulting formula.

        self = (\\x.(walk x), (subj -o f))
        arg = (john , subj)
        returns ((walk john), f)

        :raise linearlogic.LinearLogicApplicationException: if the index sets
            of the two formulas overlap, or if the glue application fails
        """
        # A premise may be consumed only once, so the index sets must be
        # disjoint.
        if self.indices & arg.indices:
            raise linearlogic.LinearLogicApplicationException(
                "'%s' applied to '%s'. Indices are not disjoint." % (self, arg)
            )
        combined_indices = self.indices | arg.indices

        try:
            applied_glue = linearlogic.ApplicationExpression(
                self.glue, arg.glue, arg.indices
            )
        except linearlogic.LinearLogicApplicationException as err:
            raise linearlogic.LinearLogicApplicationException(
                "'%s' applied to '%s'" % (self.simplify(), arg.simplify())
            ) from err

        abstracted_meaning = arg.meaning
        if combined_indices:
            # if self.glue is (A -o B), each dep is taken from A.dependencies,
            # walked from last to first
            antecedent = self.glue.simplify().antecedent
            for dep in reversed(antecedent.dependencies):
                abstracted_meaning = self.make_LambdaExpression(
                    Variable("v%s" % dep), abstracted_meaning
                )
        applied_meaning = self.meaning.applyto(abstracted_meaning)

        return self.__class__(applied_meaning, applied_glue, combined_indices)

    def make_VariableExpression(self, name):
        """Return a ``VariableExpression`` built from ``name``."""
        return VariableExpression(name)

    def make_LambdaExpression(self, variable, term):
        """Return a ``LambdaExpression`` binding ``variable`` over ``term``."""
        return LambdaExpression(variable, term)

    def lambda_abstract(self, other):
        """
        Return a new formula that abstracts this one over ``other``.

        The meaning becomes a lambda over ``other``'s variable and the glue
        becomes the implication from ``other.glue`` to ``self.glue``.

        :param other: a ``GlueFormula`` whose meaning is an
            ``AbstractVariableExpression``
        """
        assert isinstance(other, GlueFormula)
        assert isinstance(other.meaning, AbstractVariableExpression)
        abstracted_meaning = self.make_LambdaExpression(
            other.meaning.variable, self.meaning
        )
        abstracted_glue = linearlogic.ImpExpression(other.glue, self.glue)
        return self.__class__(abstracted_meaning, abstracted_glue)

    def compile(self, counter=None):
        """From Iddo Lev's PhD Dissertation p108-109"""
        if not counter:
            counter = Counter()
        compiled_glue, new_forms = self.glue.simplify().compile_pos(
            counter, self.__class__
        )
        # The index for this formula is drawn only after compile_pos() has
        # used the counter.
        own_form = self.__class__(self.meaning, compiled_glue, {counter.get()})
        return new_forms + [own_form]

    def simplify(self):
        """Return a copy whose meaning and glue terms are both simplified."""
        simple_meaning = self.meaning.simplify()
        simple_glue = self.glue.simplify()
        return self.__class__(simple_meaning, simple_glue, self.indices)

    def __eq__(self, other):
        # Indices are deliberately not part of the comparison.
        if self.__class__ != other.__class__:
            return False
        return self.meaning == other.meaning and self.glue == other.glue

    def __ne__(self, other):
        return not self == other

    # sorting for use in doctests which must be deterministic
    def __lt__(self, other):
        return str(self) < str(other)

    def __str__(self):
        assert isinstance(self.indices, set)
        text = "%s : %s" % (self.meaning, self.glue)
        if not self.indices:
            return text
        ordered = ", ".join(str(index) for index in sorted(self.indices))
        return text + " : {" + ordered + "}"

    def __repr__(self):
        return "%s" % self
class GlueDict(dict):
    """
    Glue lexicon loaded from a semtype file.

    Each key is a semtype name.  Each value is a dict that maps a
    relationship set (a ``frozenset`` of relation names, or ``None`` for
    entries that name no relationship set) to a list of
    ``[meaning_term, glue_term]`` string pairs.
    """

    def __init__(self, filename, encoding=None):
        """
        :param filename: resource name or path of the semtype file
        :param encoding: encoding passed on to ``nltk.data.load``
        """
        self.filename = filename
        self.file_encoding = encoding
        self.read_file()

    def read_file(self, empty_first=True):
        """
        Load ``self.filename`` and add its entries to this dictionary.

        :param empty_first: if true, clear the dictionary before loading
        :raise LookupError: if the file cannot be loaded either as given or
            with a ``file:`` prefix
        :raise RuntimeError: if a formula is malformed before a ``#`` comment
        """
        if empty_first:
            self.clear()

        try:
            contents = nltk.data.load(
                self.filename, format="text", encoding=self.file_encoding
            )
            # TODO: the above can't handle zip files, but this should anyway be fixed in nltk.data.load()
        except LookupError as e:
            try:
                contents = nltk.data.load(
                    "file:" + self.filename, format="text", encoding=self.file_encoding
                )
            except LookupError:
                # Report the failure of the first (unprefixed) attempt.
                raise e

        for line in contents.splitlines():
            # example: 'n : (\\x.(<word> x), (v-or))'
            # lambdacalc -^ linear logic -^
            line = line.strip()
            if not line or line[0] == "#":
                continue  # skip empty and commented out lines

            # ['verb', '(\\x.(<word> x), ( subj -o f ))', '[subj]']
            parts = line.split(" : ", 2)

            glue_formulas = []
            if len(parts) > 1:
                glue_formulas = self._parse_glue_formulas(parts[1], line)

            relationships = None
            if len(parts) > 2:  # if there is a relationship entry at the end
                rel_start = parts[2].index("[") + 1
                rel_end = parts[2].index("]")
                if rel_start == rel_end:
                    relationships = frozenset()
                else:
                    relationships = frozenset(
                        r.strip() for r in parts[2][rel_start:rel_end].split(",")
                    )

            # A key written as 'sem(supertype)' inherits the supertype's
            # entries; str.index raises ValueError when no parentheses exist.
            try:
                start_inheritance = parts[0].index("(")
                end_inheritance = parts[0].index(")")
            except ValueError:
                sem = parts[0].strip()
                supertype = None
            else:
                sem = parts[0][:start_inheritance].strip()
                supertype = parts[0][start_inheritance + 1 : end_inheritance]

            if sem not in self:
                self[sem] = {}

            if relationships is None:
                # not specified for a specific relationship set
                if supertype:
                    # add all relationship entries for parents, then the new
                    # glue formulas to every rel entry
                    for rels in self[supertype]:
                        if rels not in self[sem]:
                            self[sem][rels] = []
                        self[sem][rels].extend(self[supertype][rels])
                        self[sem][rels].extend(glue_formulas)
                else:
                    if None not in self[sem]:
                        self[sem][None] = []
                    self[sem][None].extend(glue_formulas)
            else:
                if relationships not in self[sem]:
                    self[sem][relationships] = []
                if supertype:
                    self[sem][relationships].extend(self[supertype][relationships])
                # add the glue entry to the dictionary
                self[sem][relationships].extend(glue_formulas)

    def _parse_glue_formulas(self, formulas_text, line):
        """Split '(meaning, glue) ...' text into [meaning_term, glue_term] pairs."""
        glue_formulas = []
        paren_count = 0
        tuple_start = 0
        tuple_comma = 0
        for i, c in enumerate(formulas_text):
            if c == "(":
                if paren_count == 0:  # the first '(' of a tuple
                    tuple_start = i + 1
                paren_count += 1
            elif c == ")":
                paren_count -= 1
                if paren_count == 0:  # the last ')' of a tuple
                    meaning_term = formulas_text[tuple_start:tuple_comma]
                    glue_term = formulas_text[tuple_comma + 1 : i]
                    glue_formulas.append([meaning_term, glue_term])
            elif c == ",":
                if paren_count == 1:  # comma separating the parts of the tuple
                    tuple_comma = i
            elif c == "#":  # skip comments at the ends of lines
                if paren_count != 0:  # the line hasn't parsed correctly so far
                    raise RuntimeError(
                        "Formula syntax is incorrect for entry " + line
                    )
                break
        return glue_formulas

    def __str__(self):
        pieces = []
        for pos in self:
            str_pos = "%s" % pos
            continuation = " " * (len(str_pos) + 2)
            for relset in self[pos]:
                entries = self[pos][relset]
                for i, gf in enumerate(entries, start=1):
                    # Only the first formula of a relset carries the semtype
                    # name; only the last one carries the relset itself.
                    pieces.append(str_pos + ": " if i == 1 else continuation)
                    pieces.append("%s" % gf)
                    if relset and i == len(entries):
                        pieces.append(" : %s" % relset)
                    pieces.append("\n")
        return "".join(pieces)

    def to_glueformula_list(self, depgraph, node=None, counter=None, verbose=False):
        """
        Collect the glue formulas for ``node`` and, recursively, for every
        node listed in its ``deps``.

        :param depgraph: object whose ``nodes`` mapping is indexed by address
        :param node: node to start from; if ``None``, the first dependent of
            ``depgraph.nodes[0]`` is used with a fresh ``Counter``
        :param counter: counter handed on to ``lookup``
        :param verbose: passed through to the recursive calls
        :return: list of glue formulas
        """
        if node is None:
            # TODO: should it be depgraph.root? Is this code tested?
            top = depgraph.nodes[0]
            depList = list(chain.from_iterable(top["deps"].values()))
            root = depgraph.nodes[depList[0]]
            return self.to_glueformula_list(depgraph, root, Counter(), verbose)

        glueformulas = self.lookup(node, depgraph, counter)
        for dep_idx in chain.from_iterable(node["deps"].values()):
            dep = depgraph.nodes[dep_idx]
            glueformulas.extend(
                self.to_glueformula_list(depgraph, dep, counter, verbose)
            )
        return glueformulas

    def lookup(self, node, depgraph, counter):
        """
        Return the glue formulas for a single node.

        :return: an empty list when none of the node's candidate semtypes has
            an entry in this dictionary
        :raise KeyError: when the semtype is known but has no entry that fits
            the node's relationship set
        """
        semtype = None
        for name in self.get_semtypes(node):
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        # _lookup_semtype_option() returns None when nothing matches, so test
        # truthiness; len(None) would raise TypeError instead of KeyError.
        if not lookup:
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" % (node["word"], node["tag"], node["rel"])
            )

        return self.get_glueformulas_from_semtype_entry(
            lookup, node["word"], node, depgraph, counter
        )

    def add_missing_dependencies(self, node, depgraph):
        """
        For a node whose relation is "main", add the head node's unique
        "subj" dependent to ``node["deps"]``.  Other nodes are left untouched.
        """
        rel = node["rel"].lower()

        if rel == "main":
            headnode = depgraph.nodes[node["head"]]
            subj = self.lookup_unique("subj", headnode, depgraph)
            relation = subj["rel"]
            node["deps"].setdefault(relation, []).append(subj["address"])

    def _lookup_semtype_option(self, semtype, node, depgraph):
        """
        Pick the entry of ``semtype`` that fits the node's dependents.

        :return: the list stored under the chosen relationship set, or
            ``None`` when neither a match nor a ``None`` entry exists
        """
        relationships = frozenset(
            depgraph.nodes[dep]["rel"].lower()
            for dep in chain.from_iterable(node["deps"].values())
            if depgraph.nodes[dep]["rel"].lower() not in OPTIONAL_RELATIONSHIPS
        )

        try:
            lookup = semtype[relationships]
        except KeyError:
            # An exact match is not found, so find the best match where
            # 'best' is defined as the glue entry whose relationship set has the
            # most relations of any possible relationship set that is a subset
            # of the actual depgraph
            best_match = frozenset()
            for relset_option in set(semtype) - set([None]):
                if (
                    len(relset_option) > len(best_match)
                    and relset_option < relationships
                ):
                    best_match = relset_option
            if not best_match:
                if None in semtype:
                    best_match = None
                else:
                    return None
            lookup = semtype[best_match]

        return lookup

    def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node["rel"].lower()
        word = node["word"].lower()

        if rel == "spec":
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES["default"]]
        elif rel in ["nmod", "vmod"]:
            return [node["tag"], rel]
        else:
            return [node["tag"]]

    def get_glueformulas_from_semtype_entry(
        self, lookup, word, node, depgraph, counter
    ):
        """
        Build one glue formula per ``(meaning, glue)`` pair in ``lookup``.

        Each formula gets a ``word`` attribute: ``word`` itself for the first
        formula and ``word`` followed by its 1-based position for the rest.
        """
        glueformulas = []

        glueFormulaFactory = self.get_GlueFormula_factory()
        for meaning, glue in lookup:
            gf = glueFormulaFactory(self.get_meaning_formula(meaning, word), glue)
            if not glueformulas:
                gf.word = word
            else:
                gf.word = "%s%s" % (word, len(glueformulas) + 1)

            gf.glue = self.initialize_labels(gf.glue, node, depgraph, counter.get())

            glueformulas.append(gf)
        return glueformulas

    def get_meaning_formula(self, generic, word):
        """
        :param generic: A meaning formula string containing the
            parameter "<word>"
        :param word: The actual word to replace "<word>"; any "." in it is
            removed first
        """
        word = word.replace(".", "")
        return generic.replace("<word>", word)

    def initialize_labels(self, expr, node, depgraph, unique_index):
        """
        Return a copy of ``expr`` in which every atomic name is replaced by
        the label that ``find_label_name`` resolves for it.  Labels starting
        with an upper-case letter become variables, all others constants.
        """
        if isinstance(expr, linearlogic.AtomicExpression):
            name = self.find_label_name(expr.name.lower(), node, depgraph, unique_index)
            if name[0].isupper():
                return linearlogic.VariableExpression(name)
            else:
                return linearlogic.ConstantExpression(name)
        else:
            return linearlogic.ImpExpression(
                self.initialize_labels(expr.antecedent, node, depgraph, unique_index),
                self.initialize_labels(expr.consequent, node, depgraph, unique_index),
            )

    def find_label_name(self, name, node, depgraph, unique_index):
        """
        Resolve a (possibly dotted) atom name to a label string.

        A dotted name is resolved step by step: "super" moves to the head
        node and any other prefix moves to the unique dependent with that
        relation.

        :raise KeyError: if a relation does not identify exactly one dependent
        """
        before_dot, dot, after_dot = name.partition(".")
        if dot:
            if before_dot == "super":
                next_node = depgraph.nodes[node["head"]]
            else:
                next_node = self.lookup_unique(before_dot, node, depgraph)
            return self.find_label_name(after_dot, next_node, depgraph, unique_index)

        lbl = self.get_label(node)
        if name == "f":
            return lbl
        elif name == "v":
            return "%sv" % lbl
        elif name == "r":
            return "%sr" % lbl
        elif name == "super":
            return self.get_label(depgraph.nodes[node["head"]])
        elif name == "var":
            return "%s%s" % (lbl.upper(), unique_index)
        elif name == "a":
            return self.get_label(self.lookup_unique("conja", node, depgraph))
        elif name == "b":
            return self.get_label(self.lookup_unique("conjb", node, depgraph))
        else:
            return self.get_label(self.lookup_unique(name, node, depgraph))

    def get_label(self, node):
        """
        Pick an alphabetic character as identifier for an entity in the model.

        :param node: node whose integer ``"address"`` selects the label
        :type node: dict
        :return: a letter, followed by ``address // 26`` when that is positive
        :rtype: str
        """
        value = node["address"]
        letters = "fghijklmnopqrstuvwxyzabcde"
        # Wrap around the alphabet so that addresses above 26 no longer raise
        # IndexError.  The numeric suffix keeps wrapped labels distinct, and
        # labels for addresses 0..26 are unchanged.
        letter = letters[(value - 1) % len(letters)]
        num = int(value) // 26
        if num > 0:
            return letter + str(num)
        else:
            return letter

    def lookup_unique(self, rel, node, depgraph):
        """
        Lookup 'key'. There should be exactly one item in the associated relation.

        :raise KeyError: if ``node`` has no dependent, or more than one
            dependent, whose relation equals ``rel`` (case-insensitively)
        """
        deps = [
            depgraph.nodes[dep]
            for dep in chain.from_iterable(node["deps"].values())
            if depgraph.nodes[dep]["rel"].lower() == rel.lower()
        ]

        if len(deps) == 0:
            raise KeyError("'%s' doesn't contain a feature '%s'" % (node["word"], rel))
        elif len(deps) > 1:
            raise KeyError(
                "'%s' should only have one feature '%s'" % (node["word"], rel)
            )
        else:
            return deps[0]

    def get_GlueFormula_factory(self):
        """Return the class used to instantiate glue formulas."""
        return GlueFormula
class Glue(object):
    """
    Driver that turns a sentence into meaning readings: dependency parse,
    glue-formula lookup, compilation, then derivation of readings.
    """

    def __init__(
        self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False
    ):
        """
        :param semtype_file: semtype file to load; defaults to
            ``grammars/sample_grammars/glue.semtype``
        :param remove_duplicates: if true, readings that the prover reports
            as equivalent to an earlier reading are dropped
        :param depparser: dependency parser; if ``None``, ``dep_parse``
            creates a ``MaltParser`` on first use
        :param verbose: if true, print the compiled premises and pass
            ``verbose=True`` to the parser
        """
        self.verbose = verbose
        self.remove_duplicates = remove_duplicates
        self.depparser = depparser

        from nltk import Prover9

        self.prover = Prover9()

        if semtype_file:
            self.semtype_file = semtype_file
        else:
            self.semtype_file = os.path.join(
                "grammars", "sample_grammars", "glue.semtype"
            )

    def train_depparser(self, depgraphs=None):
        """
        Train ``self.depparser`` on ``depgraphs`` or, when none are given, on
        the ``glue_train.conll`` sample grammar.
        """
        if depgraphs:
            self.depparser.train(depgraphs)
        else:
            self.depparser.train_from_file(
                nltk.data.find(
                    os.path.join("grammars", "sample_grammars", "glue_train.conll")
                )
            )

    def parse_to_meaning(self, sentence):
        """Return the list of meaning readings derived for ``sentence``."""
        readings = []
        for agenda in self.parse_to_compiled(sentence):
            readings.extend(self.get_readings(agenda))
        return readings

    def get_readings(self, agenda):
        """
        Derive readings from a list of compiled glue formulas.

        ``agenda`` is consumed (popped until empty).  Every formula whose
        glue term is an implication is applied to each stored formula that
        unifies with its antecedent, and vice versa; results go back on the
        agenda.  A formula counts as a reading when the size of its index
        set equals the initial length of the agenda.

        :return: list of meaning expressions
        """
        readings = []
        agenda_length = len(agenda)
        atomics = dict()
        nonatomics = dict()
        while agenda:  # is not empty
            cur = agenda.pop()
            glue_simp = cur.glue.simplify()
            if isinstance(
                glue_simp, linearlogic.ImpExpression
            ):  # if cur.glue is non-atomic
                for key in atomics:
                    try:
                        if isinstance(cur.glue, linearlogic.ApplicationExpression):
                            bindings = cur.glue.bindings
                        else:
                            bindings = linearlogic.BindingDict()
                        glue_simp.antecedent.unify(key, bindings)
                        for atomic in atomics[key]:
                            if not (
                                cur.indices & atomic.indices
                            ):  # if the sets of indices are disjoint
                                try:
                                    agenda.append(cur.applyto(atomic))
                                except linearlogic.LinearLogicApplicationException:
                                    pass
                    except linearlogic.UnificationException:
                        pass
                nonatomics.setdefault(glue_simp.antecedent, []).append(cur)

            else:  # else cur.glue is atomic
                for key in nonatomics:
                    for nonatomic in nonatomics[key]:
                        try:
                            if isinstance(
                                nonatomic.glue, linearlogic.ApplicationExpression
                            ):
                                bindings = nonatomic.glue.bindings
                            else:
                                bindings = linearlogic.BindingDict()
                            glue_simp.unify(key, bindings)
                            if not (
                                cur.indices & nonatomic.indices
                            ):  # if the sets of indices are disjoint
                                try:
                                    agenda.append(nonatomic.applyto(cur))
                                except linearlogic.LinearLogicApplicationException:
                                    pass
                        except linearlogic.UnificationException:
                            pass
                atomics.setdefault(glue_simp, []).append(cur)

        for entry in atomics:
            for gf in atomics[entry]:
                if len(gf.indices) == agenda_length:
                    self._add_to_reading_list(gf, readings)
        for entry in nonatomics:
            for gf in nonatomics[entry]:
                if len(gf.indices) == agenda_length:
                    self._add_to_reading_list(gf, readings)
        return readings

    def _add_to_reading_list(self, glueformula, reading_list):
        """
        Append ``glueformula.meaning`` to ``reading_list`` unless duplicate
        removal is on and the prover finds an equivalent reading there.
        """
        add_reading = True
        if self.remove_duplicates:
            for reading in reading_list:
                try:
                    if reading.equiv(glueformula.meaning, self.prover):
                        add_reading = False
                        break
                except Exception as e:
                    # if there is an exception, the syntax of the formula
                    # may not be understandable by the prover, so don't
                    # throw out the reading.
                    print("Error when checking logical equality of statements", e)

        if add_reading:
            reading_list.append(glueformula.meaning)

    def parse_to_compiled(self, sentence):
        """Return one list of compiled glue formulas per dependency parse."""
        gfls = [self.depgraph_to_glue(dg) for dg in self.dep_parse(sentence)]
        return [self.gfl_to_compiled(gfl) for gfl in gfls]

    def dep_parse(self, sentence):
        """
        Dependency-parse the sentence.

        :param sentence: the sentence to be parsed
        :type sentence: list(str)
        :return: whatever ``self.depparser.parse`` returns;
            ``parse_to_compiled`` iterates over it and treats every item as
            a dependency graph
        """
        # Lazy-initialize the depparser
        if self.depparser is None:
            from nltk.parse import MaltParser

            self.depparser = MaltParser(tagger=self.get_pos_tagger())
        if not self.depparser._trained:
            self.train_depparser()
        return self.depparser.parse(sentence, verbose=self.verbose)

    def depgraph_to_glue(self, depgraph):
        """Return the list of glue formulas for ``depgraph``."""
        return self.get_glue_dict().to_glueformula_list(depgraph)

    def get_glue_dict(self):
        """Return a ``GlueDict`` loaded from ``self.semtype_file``."""
        return GlueDict(self.semtype_file)

    def gfl_to_compiled(self, gfl):
        """
        Compile every formula in ``gfl`` with one shared index counter and
        return the concatenated results, printing them when verbose.
        """
        index_counter = Counter()
        return_list = []
        for gf in gfl:
            return_list.extend(gf.compile(index_counter))

        if self.verbose:
            print("Compiled Glue Premises:")
            for cgf in return_list:
                print(cgf)

        return return_list

    def get_pos_tagger(self):
        """
        Build the default tagger: a regexp tagger that overrides quantifier
        words, backed off to trigram, bigram and unigram taggers trained on
        the Brown "news" category, and finally to a regexp tagger.
        """
        from nltk.corpus import brown

        regexp_tagger = RegexpTagger(
            [
                # The decimal point is escaped: an unescaped "." matches any
                # character, so tokens such as "1a2" would be tagged CD.
                (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),  # cardinal numbers
                (r"(The|the|A|a|An|an)$", "AT"),  # articles
                (r".*able$", "JJ"),  # adjectives
                (r".*ness$", "NN"),  # nouns formed from adjectives
                (r".*ly$", "RB"),  # adverbs
                (r".*s$", "NNS"),  # plural nouns
                (r".*ing$", "VBG"),  # gerunds
                (r".*ed$", "VBD"),  # past tense verbs
                (r".*", "NN"),  # nouns (default)
            ]
        )
        brown_train = brown.tagged_sents(categories="news")
        unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
        bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

        # Override particular words
        main_tagger = RegexpTagger(
            [(r"(A|a|An|an)$", "ex_quant"), (r"(Every|every|All|all)$", "univ_quant")],
            backoff=trigram_tagger,
        )

        return main_tagger
class DrtGlueFormula(GlueFormula):
    """A ``GlueFormula`` whose meaning term is a DRT expression."""

    def __init__(self, meaning, glue, indices=None):
        """
        :param meaning: a ``drt.DrtExpression``, or a string parsed with
            ``drt.DrtExpression.fromstring``
        :param glue: a ``linearlogic.Expression``, or a string parsed with
            ``linearlogic.LinearLogicParser``
        :param indices: the set of indices of this formula; any falsy value
            is replaced by a fresh empty set
        :raise RuntimeError: if ``meaning`` or ``glue`` has any other type
        """
        if not isinstance(meaning, (str, drt.DrtExpression)):
            raise RuntimeError(
                "Meaning term neither string or expression: %s, %s"
                % (meaning, meaning.__class__)
            )
        if isinstance(meaning, str):
            meaning = drt.DrtExpression.fromstring(meaning)
        self.meaning = meaning

        if not isinstance(glue, (str, linearlogic.Expression)):
            raise RuntimeError(
                "Glue term neither string or expression: %s, %s"
                % (glue, glue.__class__)
            )
        if isinstance(glue, str):
            glue = linearlogic.LinearLogicParser().parse(glue)
        self.glue = glue

        self.indices = indices if indices else set()

    def make_VariableExpression(self, name):
        """Return a ``drt.DrtVariableExpression`` built from ``name``."""
        return drt.DrtVariableExpression(name)

    def make_LambdaExpression(self, variable, term):
        """Return a ``drt.DrtLambdaExpression`` binding ``variable`` over ``term``."""
        return drt.DrtLambdaExpression(variable, term)
class DrtGlueDict(GlueDict):
    """A ``GlueDict`` that produces ``DrtGlueFormula`` instances."""

    def get_GlueFormula_factory(self):
        """Return the class used to instantiate glue formulas."""
        factory = DrtGlueFormula
        return factory
class DrtGlue(Glue):
    """A ``Glue`` driver that uses the DRT semtype file and ``DrtGlueDict``."""

    def __init__(
        self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False
    ):
        """
        Same parameters as ``Glue.__init__``, except that a missing
        ``semtype_file`` defaults to
        ``grammars/sample_grammars/drt_glue.semtype``.
        """
        chosen_file = semtype_file or os.path.join(
            "grammars", "sample_grammars", "drt_glue.semtype"
        )
        Glue.__init__(self, chosen_file, remove_duplicates, depparser, verbose)

    def get_glue_dict(self):
        """Return a ``DrtGlueDict`` loaded from ``self.semtype_file``."""
        glue_dict = DrtGlueDict(self.semtype_file)
        return glue_dict
def demo(show_example=-1):
    """
    Print the readings of a few sample sentences.

    :param show_example: index of the single example to run, or -1 (the
        default) to run all of them
    """
    from nltk.parse import MaltParser

    examples = [
        "David sees Mary",
        "David eats a sandwich",
        "every man chases a dog",
        "every man believes a dog sleeps",
        "John gives David a sandwich",
        "John chases himself",
    ]
    # 'John persuades David to order a pizza',
    # 'John tries to go',
    # 'John tries to find a unicorn',
    # 'John seems to vanish',
    # 'a unicorn seems to approach',
    # 'every big cat leaves',
    # 'every gray cat leaves',
    # 'every big gray cat leaves',
    # 'a former senator leaves',

    print("============== DEMO ==============")

    tag_patterns = [
        ("^(David|Mary|John)$", "NNP"),
        (
            "^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$",
            "VB",
        ),
        ("^(go|order|vanish|find|approach)$", "VB"),
        ("^(a)$", "ex_quant"),
        ("^(every)$", "univ_quant"),
        ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"),
        ("^(big|gray|former)$", "JJ"),
        ("^(him|himself)$", "PRP"),
    ]
    demo_tagger = RegexpTagger(tag_patterns)

    glue = Glue(depparser=MaltParser(tagger=demo_tagger), verbose=False)

    run_all = show_example == -1
    for number, sentence in enumerate(examples):
        if not (number == show_example or run_all):
            continue
        print("[[[Example %s]]] %s" % (number, sentence))
        for reading in glue.parse_to_meaning(sentence.split()):
            print(reading.simplify())
        print("")


if __name__ == "__main__":
    demo()