Source code for nltk.sem.glue

# Natural Language Toolkit: Glue Semantics
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
# Copyright (C) 2001-2023 NLTK Project
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import os
from itertools import chain

import nltk
from nltk.internals import Counter
from nltk.sem import drt, linearlogic
from nltk.sem.logic import (
    AbstractVariableExpression,
    Expression,
    LambdaExpression,
    Variable,
    VariableExpression,
)
from nltk.tag import BigramTagger, RegexpTagger, TrigramTagger, UnigramTagger

SPEC_SEMTYPES = {
    "a": "ex_quant",
    "an": "ex_quant",
    "every": "univ_quant",
    "the": "def_art",
    "no": "no_quant",
    "default": "ex_quant",
}

OPTIONAL_RELATIONSHIPS = ["nmod", "vmod", "punct"]


[docs]class GlueFormula:
[docs]    def __init__(self, meaning, glue, indices=None):
        if not indices:
            indices = set()

        if isinstance(meaning, str):
            self.meaning = Expression.fromstring(meaning)
        elif isinstance(meaning, Expression):
            self.meaning = meaning
        else:
            raise RuntimeError(
                "Meaning term neither string or expression: %s, %s"
                % (meaning, meaning.__class__)
            )

        if isinstance(glue, str):
            self.glue = linearlogic.LinearLogicParser().parse(glue)
        elif isinstance(glue, linearlogic.Expression):
            self.glue = glue
        else:
            raise RuntimeError(
                "Glue term neither string or expression: %s, %s"
                % (glue, glue.__class__)
            )

        self.indices = indices

[docs]    def applyto(self, arg):
        """self = (\\x.(walk x), (subj -o f))
        arg  = (john        ,  subj)
        returns ((walk john),          f)
        """
        if self.indices & arg.indices:  # if the sets are NOT disjoint
            raise linearlogic.LinearLogicApplicationException(
                f"'{self}' applied to '{arg}'.  Indices are not disjoint."
            )
        else:  # if the sets ARE disjoint
            return_indices = self.indices | arg.indices

        try:
            return_glue = linearlogic.ApplicationExpression(
                self.glue, arg.glue, arg.indices
            )
        except linearlogic.LinearLogicApplicationException as e:
            raise linearlogic.LinearLogicApplicationException(
                f"'{self.simplify()}' applied to '{arg.simplify()}'"
            ) from e

        arg_meaning_abstracted = arg.meaning
        if return_indices:
            for dep in self.glue.simplify().antecedent.dependencies[
                ::-1
            ]:  # if self.glue is (A -o B), dep is in A.dependencies
                arg_meaning_abstracted = self.make_LambdaExpression(
                    Variable("v%s" % dep), arg_meaning_abstracted
                )
        return_meaning = self.meaning.applyto(arg_meaning_abstracted)

        return self.__class__(return_meaning, return_glue, return_indices)

[docs]    def make_VariableExpression(self, name):
        return VariableExpression(name)

[docs]    def make_LambdaExpression(self, variable, term):
        return LambdaExpression(variable, term)

[docs]    def lambda_abstract(self, other):
        assert isinstance(other, GlueFormula)
        assert isinstance(other.meaning, AbstractVariableExpression)
        return self.__class__(
            self.make_LambdaExpression(other.meaning.variable, self.meaning),
            linearlogic.ImpExpression(other.glue, self.glue),
        )

[docs]    def compile(self, counter=None):
        """From Iddo Lev's PhD Dissertation p108-109"""
        if not counter:
            counter = Counter()
        (compiled_glue, new_forms) = self.glue.simplify().compile_pos(
            counter, self.__class__
        )
        return new_forms + [
            self.__class__(self.meaning, compiled_glue, {counter.get()})
        ]

[docs]    def simplify(self):
        return self.__class__(
            self.meaning.simplify(), self.glue.simplify(), self.indices
        )

    def __eq__(self, other):
        return (
            self.__class__ == other.__class__
            and self.meaning == other.meaning
            and self.glue == other.glue
        )

    def __ne__(self, other):
        return not self == other

    # sorting for use in doctests which must be deterministic
    def __lt__(self, other):
        return str(self) < str(other)

    def __str__(self):
        assert isinstance(self.indices, set)
        accum = f"{self.meaning} : {self.glue}"
        if self.indices:
            accum += (
                " : {" + ", ".join(str(index) for index in sorted(self.indices)) + "}"
            )
        return accum

    def __repr__(self):
        return "%s" % self


[docs]class GlueDict(dict):
[docs]    def __init__(self, filename, encoding=None):
        self.filename = filename
        self.file_encoding = encoding
        self.read_file()

[docs]    def read_file(self, empty_first=True):
        if empty_first:
            self.clear()

        try:
            contents = nltk.data.load(
                self.filename, format="text", encoding=self.file_encoding
            )
            # TODO: the above can't handle zip files, but this should anyway be fixed in nltk.data.load()
        except LookupError as e:
            try:
                contents = nltk.data.load(
                    "file:" + self.filename, format="text", encoding=self.file_encoding
                )
            except LookupError:
                raise e
        lines = contents.splitlines()

        for line in lines:  # example: 'n : (\\x.(<word> x), (v-or))'
            #     lambdacalc -^  linear logic -^
            line = line.strip()  # remove trailing newline
            if not len(line):
                continue  # skip empty lines
            if line[0] == "#":
                continue  # skip commented out lines

            parts = line.split(
                " : ", 2
            )  # ['verb', '(\\x.(<word> x), ( subj -o f ))', '[subj]']

            glue_formulas = []
            paren_count = 0
            tuple_start = 0
            tuple_comma = 0

            relationships = None

            if len(parts) > 1:
                for (i, c) in enumerate(parts[1]):
                    if c == "(":
                        if paren_count == 0:  # if it's the first '(' of a tuple
                            tuple_start = i + 1  # then save the index
                        paren_count += 1
                    elif c == ")":
                        paren_count -= 1
                        if paren_count == 0:  # if it's the last ')' of a tuple
                            meaning_term = parts[1][
                                tuple_start:tuple_comma
                            ]  # '\\x.(<word> x)'
                            glue_term = parts[1][tuple_comma + 1 : i]  # '(v-r)'
                            glue_formulas.append(
                                [meaning_term, glue_term]
                            )  # add the GlueFormula to the list
                    elif c == ",":
                        if (
                            paren_count == 1
                        ):  # if it's a comma separating the parts of the tuple
                            tuple_comma = i  # then save the index
                    elif c == "#":  # skip comments at the ends of lines
                        if (
                            paren_count != 0
                        ):  # if the line hasn't parsed correctly so far
                            raise RuntimeError(
                                "Formula syntax is incorrect for entry " + line
                            )
                        break  # break to the next line

            if len(parts) > 2:  # if there is a relationship entry at the end
                rel_start = parts[2].index("[") + 1
                rel_end = parts[2].index("]")
                if rel_start == rel_end:
                    relationships = frozenset()
                else:
                    relationships = frozenset(
                        r.strip() for r in parts[2][rel_start:rel_end].split(",")
                    )

            try:
                start_inheritance = parts[0].index("(")
                end_inheritance = parts[0].index(")")
                sem = parts[0][:start_inheritance].strip()
                supertype = parts[0][start_inheritance + 1 : end_inheritance]
            except:
                sem = parts[0].strip()
                supertype = None

            if sem not in self:
                self[sem] = {}

            if (
                relationships is None
            ):  # if not specified for a specific relationship set
                # add all relationship entries for parents
                if supertype:
                    for rels in self[supertype]:
                        if rels not in self[sem]:
                            self[sem][rels] = []
                        glue = self[supertype][rels]
                        self[sem][rels].extend(glue)
                        self[sem][rels].extend(
                            glue_formulas
                        )  # add the glue formulas to every rel entry
                else:
                    if None not in self[sem]:
                        self[sem][None] = []
                    self[sem][None].extend(
                        glue_formulas
                    )  # add the glue formulas to every rel entry
            else:
                if relationships not in self[sem]:
                    self[sem][relationships] = []
                if supertype:
                    self[sem][relationships].extend(self[supertype][relationships])
                self[sem][relationships].extend(
                    glue_formulas
                )  # add the glue entry to the dictionary

    def __str__(self):
        accum = ""
        for pos in self:
            str_pos = "%s" % pos
            for relset in self[pos]:
                i = 1
                for gf in self[pos][relset]:
                    if i == 1:
                        accum += str_pos + ": "
                    else:
                        accum += " " * (len(str_pos) + 2)
                    accum += "%s" % gf
                    if relset and i == len(self[pos][relset]):
                        accum += " : %s" % relset
                    accum += "\n"
                    i += 1
        return accum

[docs]    def to_glueformula_list(self, depgraph, node=None, counter=None, verbose=False):
        if node is None:
            # TODO: should it be depgraph.root? Is this code tested?
            top = depgraph.nodes[0]
            depList = list(chain.from_iterable(top["deps"].values()))
            root = depgraph.nodes[depList[0]]

            return self.to_glueformula_list(depgraph, root, Counter(), verbose)

        glueformulas = self.lookup(node, depgraph, counter)
        for dep_idx in chain.from_iterable(node["deps"].values()):
            dep = depgraph.nodes[dep_idx]
            glueformulas.extend(
                self.to_glueformula_list(depgraph, dep, counter, verbose)
            )
        return glueformulas

[docs]    def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" % (node["word"], node["tag"], node["rel"])
            )

        return self.get_glueformulas_from_semtype_entry(
            lookup, node["word"], node, depgraph, counter
        )

[docs]    def add_missing_dependencies(self, node, depgraph):
        rel = node["rel"].lower()

        if rel == "main":
            headnode = depgraph.nodes[node["head"]]
            subj = self.lookup_unique("subj", headnode, depgraph)
            relation = subj["rel"]
            node["deps"].setdefault(relation, [])
            node["deps"][relation].append(subj["address"])
            # node['deps'].append(subj['address'])

    def _lookup_semtype_option(self, semtype, node, depgraph):
        relationships = frozenset(
            depgraph.nodes[dep]["rel"].lower()
            for dep in chain.from_iterable(node["deps"].values())
            if depgraph.nodes[dep]["rel"].lower() not in OPTIONAL_RELATIONSHIPS
        )

        try:
            lookup = semtype[relationships]
        except KeyError:
            # An exact match is not found, so find the best match where
            # 'best' is defined as the glue entry whose relationship set has the
            # most relations of any possible relationship set that is a subset
            # of the actual depgraph
            best_match = frozenset()
            for relset_option in set(semtype) - {None}:
                if (
                    len(relset_option) > len(best_match)
                    and relset_option < relationships
                ):
                    best_match = relset_option
            if not best_match:
                if None in semtype:
                    best_match = None
                else:
                    return None
            lookup = semtype[best_match]

        return lookup

[docs]    def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node["rel"].lower()
        word = node["word"].lower()

        if rel == "spec":
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES["default"]]
        elif rel in ["nmod", "vmod"]:
            return [node["tag"], rel]
        else:
            return [node["tag"]]

[docs]    def get_glueformulas_from_semtype_entry(
        self, lookup, word, node, depgraph, counter
    ):
        glueformulas = []

        glueFormulaFactory = self.get_GlueFormula_factory()
        for meaning, glue in lookup:
            gf = glueFormulaFactory(self.get_meaning_formula(meaning, word), glue)
            if not len(glueformulas):
                gf.word = word
            else:
                gf.word = f"{word}{len(glueformulas) + 1}"

            gf.glue = self.initialize_labels(gf.glue, node, depgraph, counter.get())

            glueformulas.append(gf)
        return glueformulas

[docs]    def get_meaning_formula(self, generic, word):
        """
        :param generic: A meaning formula string containing the
            parameter "<word>"
        :param word: The actual word to be replace "<word>"
        """
        word = word.replace(".", "")
        return generic.replace("<word>", word)

[docs]    def initialize_labels(self, expr, node, depgraph, unique_index):
        if isinstance(expr, linearlogic.AtomicExpression):
            name = self.find_label_name(expr.name.lower(), node, depgraph, unique_index)
            if name[0].isupper():
                return linearlogic.VariableExpression(name)
            else:
                return linearlogic.ConstantExpression(name)
        else:
            return linearlogic.ImpExpression(
                self.initialize_labels(expr.antecedent, node, depgraph, unique_index),
                self.initialize_labels(expr.consequent, node, depgraph, unique_index),
            )

[docs]    def find_label_name(self, name, node, depgraph, unique_index):
        try:
            dot = name.index(".")

            before_dot = name[:dot]
            after_dot = name[dot + 1 :]
            if before_dot == "super":
                return self.find_label_name(
                    after_dot, depgraph.nodes[node["head"]], depgraph, unique_index
                )
            else:
                return self.find_label_name(
                    after_dot,
                    self.lookup_unique(before_dot, node, depgraph),
                    depgraph,
                    unique_index,
                )
        except ValueError:
            lbl = self.get_label(node)
            if name == "f":
                return lbl
            elif name == "v":
                return "%sv" % lbl
            elif name == "r":
                return "%sr" % lbl
            elif name == "super":
                return self.get_label(depgraph.nodes[node["head"]])
            elif name == "var":
                return f"{lbl.upper()}{unique_index}"
            elif name == "a":
                return self.get_label(self.lookup_unique("conja", node, depgraph))
            elif name == "b":
                return self.get_label(self.lookup_unique("conjb", node, depgraph))
            else:
                return self.get_label(self.lookup_unique(name, node, depgraph))

[docs]    def get_label(self, node):
        """
        Pick an alphabetic character as identifier for an entity in the model.

        :param value: where to index into the list of characters
        :type value: int
        """
        value = node["address"]

        letter = [
            "f",
            "g",
            "h",
            "i",
            "j",
            "k",
            "l",
            "m",
            "n",
            "o",
            "p",
            "q",
            "r",
            "s",
            "t",
            "u",
            "v",
            "w",
            "x",
            "y",
            "z",
            "a",
            "b",
            "c",
            "d",
            "e",
        ][value - 1]
        num = int(value) // 26
        if num > 0:
            return letter + str(num)
        else:
            return letter

[docs]    def lookup_unique(self, rel, node, depgraph):
        """
        Lookup 'key'. There should be exactly one item in the associated relation.
        """
        deps = [
            depgraph.nodes[dep]
            for dep in chain.from_iterable(node["deps"].values())
            if depgraph.nodes[dep]["rel"].lower() == rel.lower()
        ]

        if len(deps) == 0:
            raise KeyError(
                "'{}' doesn't contain a feature '{}'".format(node["word"], rel)
            )
        elif len(deps) > 1:
            raise KeyError(
                "'{}' should only have one feature '{}'".format(node["word"], rel)
            )
        else:
            return deps[0]

[docs]    def get_GlueFormula_factory(self):
        return GlueFormula


[docs]class Glue:
[docs]    def __init__(
        self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False
    ):
        self.verbose = verbose
        self.remove_duplicates = remove_duplicates
        self.depparser = depparser

        from nltk import Prover9

        self.prover = Prover9()

        if semtype_file:
            self.semtype_file = semtype_file
        else:
            self.semtype_file = os.path.join(
                "grammars", "sample_grammars", "glue.semtype"
            )

[docs]    def train_depparser(self, depgraphs=None):
        if depgraphs:
            self.depparser.train(depgraphs)
        else:
            self.depparser.train_from_file(
                nltk.data.find(
                    os.path.join("grammars", "sample_grammars", "glue_train.conll")
                )
            )

[docs]    def parse_to_meaning(self, sentence):
        readings = []
        for agenda in self.parse_to_compiled(sentence):
            readings.extend(self.get_readings(agenda))
        return readings

[docs]    def get_readings(self, agenda):
        readings = []
        agenda_length = len(agenda)
        atomics = dict()
        nonatomics = dict()
        while agenda:  # is not empty
            cur = agenda.pop()
            glue_simp = cur.glue.simplify()
            if isinstance(
                glue_simp, linearlogic.ImpExpression
            ):  # if cur.glue is non-atomic
                for key in atomics:
                    try:
                        if isinstance(cur.glue, linearlogic.ApplicationExpression):
                            bindings = cur.glue.bindings
                        else:
                            bindings = linearlogic.BindingDict()
                        glue_simp.antecedent.unify(key, bindings)
                        for atomic in atomics[key]:
                            if not (
                                cur.indices & atomic.indices
                            ):  # if the sets of indices are disjoint
                                try:
                                    agenda.append(cur.applyto(atomic))
                                except linearlogic.LinearLogicApplicationException:
                                    pass
                    except linearlogic.UnificationException:
                        pass
                try:
                    nonatomics[glue_simp.antecedent].append(cur)
                except KeyError:
                    nonatomics[glue_simp.antecedent] = [cur]

            else:  # else cur.glue is atomic
                for key in nonatomics:
                    for nonatomic in nonatomics[key]:
                        try:
                            if isinstance(
                                nonatomic.glue, linearlogic.ApplicationExpression
                            ):
                                bindings = nonatomic.glue.bindings
                            else:
                                bindings = linearlogic.BindingDict()
                            glue_simp.unify(key, bindings)
                            if not (
                                cur.indices & nonatomic.indices
                            ):  # if the sets of indices are disjoint
                                try:
                                    agenda.append(nonatomic.applyto(cur))
                                except linearlogic.LinearLogicApplicationException:
                                    pass
                        except linearlogic.UnificationException:
                            pass
                try:
                    atomics[glue_simp].append(cur)
                except KeyError:
                    atomics[glue_simp] = [cur]

        for entry in atomics:
            for gf in atomics[entry]:
                if len(gf.indices) == agenda_length:
                    self._add_to_reading_list(gf, readings)
        for entry in nonatomics:
            for gf in nonatomics[entry]:
                if len(gf.indices) == agenda_length:
                    self._add_to_reading_list(gf, readings)
        return readings

    def _add_to_reading_list(self, glueformula, reading_list):
        add_reading = True
        if self.remove_duplicates:
            for reading in reading_list:
                try:
                    if reading.equiv(glueformula.meaning, self.prover):
                        add_reading = False
                        break
                except Exception as e:
                    # if there is an exception, the syntax of the formula
                    # may not be understandable by the prover, so don't
                    # throw out the reading.
                    print("Error when checking logical equality of statements", e)

        if add_reading:
            reading_list.append(glueformula.meaning)

[docs]    def parse_to_compiled(self, sentence):
        gfls = [self.depgraph_to_glue(dg) for dg in self.dep_parse(sentence)]
        return [self.gfl_to_compiled(gfl) for gfl in gfls]

[docs]    def dep_parse(self, sentence):
        """
        Return a dependency graph for the sentence.

        :param sentence: the sentence to be parsed
        :type sentence: list(str)
        :rtype: DependencyGraph
        """

        # Lazy-initialize the depparser
        if self.depparser is None:
            from nltk.parse import MaltParser

            self.depparser = MaltParser(tagger=self.get_pos_tagger())
        if not self.depparser._trained:
            self.train_depparser()
        return self.depparser.parse(sentence, verbose=self.verbose)

[docs]    def depgraph_to_glue(self, depgraph):
        return self.get_glue_dict().to_glueformula_list(depgraph)

[docs]    def get_glue_dict(self):
        return GlueDict(self.semtype_file)

[docs]    def gfl_to_compiled(self, gfl):
        index_counter = Counter()
        return_list = []
        for gf in gfl:
            return_list.extend(gf.compile(index_counter))

        if self.verbose:
            print("Compiled Glue Premises:")
            for cgf in return_list:
                print(cgf)

        return return_list

[docs]    def get_pos_tagger(self):
        from nltk.corpus import brown

        regexp_tagger = RegexpTagger(
            [
                (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),  # cardinal numbers
                (r"(The|the|A|a|An|an)$", "AT"),  # articles
                (r".*able$", "JJ"),  # adjectives
                (r".*ness$", "NN"),  # nouns formed from adjectives
                (r".*ly$", "RB"),  # adverbs
                (r".*s$", "NNS"),  # plural nouns
                (r".*ing$", "VBG"),  # gerunds
                (r".*ed$", "VBD"),  # past tense verbs
                (r".*", "NN"),  # nouns (default)
            ]
        )
        brown_train = brown.tagged_sents(categories="news")
        unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
        bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

        # Override particular words
        main_tagger = RegexpTagger(
            [(r"(A|a|An|an)$", "ex_quant"), (r"(Every|every|All|all)$", "univ_quant")],
            backoff=trigram_tagger,
        )

        return main_tagger


[docs]class DrtGlueFormula(GlueFormula):
[docs]    def __init__(self, meaning, glue, indices=None):
        if not indices:
            indices = set()

        if isinstance(meaning, str):
            self.meaning = drt.DrtExpression.fromstring(meaning)
        elif isinstance(meaning, drt.DrtExpression):
            self.meaning = meaning
        else:
            raise RuntimeError(
                "Meaning term neither string or expression: %s, %s"
                % (meaning, meaning.__class__)
            )

        if isinstance(glue, str):
            self.glue = linearlogic.LinearLogicParser().parse(glue)
        elif isinstance(glue, linearlogic.Expression):
            self.glue = glue
        else:
            raise RuntimeError(
                "Glue term neither string or expression: %s, %s"
                % (glue, glue.__class__)
            )

        self.indices = indices

[docs]    def make_VariableExpression(self, name):
        return drt.DrtVariableExpression(name)

[docs]    def make_LambdaExpression(self, variable, term):
        return drt.DrtLambdaExpression(variable, term)


[docs]class DrtGlueDict(GlueDict):
[docs]    def get_GlueFormula_factory(self):
        return DrtGlueFormula


[docs]class DrtGlue(Glue):
[docs]    def __init__(
        self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False
    ):
        if not semtype_file:
            semtype_file = os.path.join(
                "grammars", "sample_grammars", "drt_glue.semtype"
            )
        Glue.__init__(self, semtype_file, remove_duplicates, depparser, verbose)

[docs]    def get_glue_dict(self):
        return DrtGlueDict(self.semtype_file)


[docs]def demo(show_example=-1):
    from nltk.parse import MaltParser

    examples = [
        "David sees Mary",
        "David eats a sandwich",
        "every man chases a dog",
        "every man believes a dog sleeps",
        "John gives David a sandwich",
        "John chases himself",
    ]
    #                'John persuades David to order a pizza',
    #                'John tries to go',
    #                'John tries to find a unicorn',
    #                'John seems to vanish',
    #                'a unicorn seems to approach',
    #                'every big cat leaves',
    #                'every gray cat leaves',
    #                'every big gray cat leaves',
    #                'a former senator leaves',

    print("============== DEMO ==============")

    tagger = RegexpTagger(
        [
            ("^(David|Mary|John)$", "NNP"),
            (
                "^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$",
                "VB",
            ),
            ("^(go|order|vanish|find|approach)$", "VB"),
            ("^(a)$", "ex_quant"),
            ("^(every)$", "univ_quant"),
            ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"),
            ("^(big|gray|former)$", "JJ"),
            ("^(him|himself)$", "PRP"),
        ]
    )

    depparser = MaltParser(tagger=tagger)
    glue = Glue(depparser=depparser, verbose=False)

    for (i, sentence) in enumerate(examples):
        if i == show_example or show_example == -1:
            print(f"[[[Example {i}]]]  {sentence}")
            for reading in glue.parse_to_meaning(sentence.split()):
                print(reading.simplify())
            print("")


if __name__ == "__main__":
    demo()