Source code for nltk.corpus.reader.knbc

#! /usr/bin/env python
# KNB Corpus reader
# Copyright (C) 2001-2023 NLTK Project
# Author: Masato Hagiwara <hagisan@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html

import re

from nltk.corpus.reader.api import CorpusReader, SyntaxCorpusReader
from nltk.corpus.reader.util import (
    FileSystemPathPointer,
    find_corpus_fileids,
    read_blankline_block,
)
from nltk.parse import DependencyGraph

# default function to convert morphlist to str for tree representation
_morphs2str_default = lambda morphs: "/".join(m[0] for m in morphs if m[0] != "EOS")
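# For example (an illustrative sketch; the morph tuples below are made up and
# not taken from the corpus), the default converter joins the surface forms
# with "/" and drops the EOS marker:
#
#     _morphs2str_default([("雨", "あめ 雨 名詞 ..."), ("が", "が が 助詞 ..."),
#                          ("降る", "ふる 降る 動詞 ..."), ("EOS", "")])
#     # -> "雨/が/降る"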


class KNBCorpusReader(SyntaxCorpusReader):
    """
    This class implements:
      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream.
      - ``_word``, which takes a block and returns a list of list of words.
      - ``_tag``, which takes a block and returns a list of list of tagged
        words.
      - ``_parse``, which takes a block and returns a list of parsed
        sentences.

    The structure of tagged words:
      tagged_word = (word(str), tags(tuple))
      tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3,
              posid3, others ...)

    Usage example

        >>> from nltk.corpus.util import LazyCorpusLoader
        >>> knbc = LazyCorpusLoader(
        ...     'knbc/corpus1',
        ...     KNBCorpusReader,
        ...     r'.*/KN.*',
        ...     encoding='euc-jp',
        ... )
        >>> len(knbc.sents()[0])
        9
    """
    def __init__(self, root, fileids, encoding="utf8", morphs2str=_morphs2str_default):
        """
        Initialize KNBCorpusReader.

        ``morphs2str`` is a function that converts a morph list to a string,
        used for the tree representation built by ``_parse()``.
        """
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)
        self.morphs2str = morphs2str
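    # A usage sketch (illustrative, not part of the original module): a caller
    # can pass a custom ``morphs2str`` to control how ``_parse()`` labels each
    # node, e.g. appending the POS field the way the demo below does:
    #
    #     reader = KNBCorpusReader(
    #         root, fileids, encoding="euc-jp",
    #         morphs2str=lambda morphs: "/".join(
    #             "{}({})".format(m[0], m[1].split(" ")[2])
    #             for m in morphs if m[0] != "EOS"
    #         ),
    #     )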
    def _read_block(self, stream):
        # blocks are split by blankline (or EOF) - default
        return read_blankline_block(stream)

    def _word(self, t):
        res = []
        for line in t.splitlines():
            # ignore the bunsetsu headers
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                res.append(cells[0])

        return res

    # ignores tagset argument
    def _tag(self, t, tagset=None):
        res = []
        for line in t.splitlines():
            # ignore the bunsetsu headers
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                # convert cells to a morph tuple
                res.append((cells[0], " ".join(cells[1:])))

        return res

    def _parse(self, t):
        dg = DependencyGraph()
        i = 0
        for line in t.splitlines():
            if line[0] in "*+":
                # start of a bunsetsu or tag unit
                cells = line.strip().split(" ", 3)
                m = re.match(r"([\-0-9]*)([ADIP])", cells[1])
                assert m is not None

                node = dg.nodes[i]
                node.update({"address": i, "rel": m.group(2), "word": []})

                dep_parent = int(m.group(1))

                if dep_parent == -1:
                    dg.root = node
                else:
                    dg.nodes[dep_parent]["deps"].append(i)

                i += 1
            elif line[0] != "#":
                # normal morph line
                cells = line.strip().split(" ")
                # convert cells to a morph tuple
                morph = cells[0], " ".join(cells[1:])
                dg.nodes[i - 1]["word"].append(morph)

        if self.morphs2str:
            for node in dg.nodes.values():
                node["word"] = self.morphs2str(node["word"])

        return dg.tree()
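# An illustrative sketch of the block layout that _read_block()/_parse()
# consume (the Japanese lines are made up; real corpus files carry more
# fields and feature annotations):
#
#     # S-ID:...                            header/comment line, skipped
#     * 1D                                  bunsetsu header: dependency index 1, relation D
#     + 1D                                  tag-unit header, same "<index><A|D|I|P>" form
#     雨 あめ 雨 名詞 6 普通名詞 1 * 0 * 0      morph line: surface reading lemma POS ...
#     が が が 助詞 9 格助詞 1 * 0 * 0
#     * -1D                                 index -1 marks the root unit
#     + -1D
#     降る ふる 降る 動詞 2 * 0 子音動詞ラ行 10 基本形 2
#     EOS                                   end of sentence; blank lines separate blocks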
######################################################################
# Demo
######################################################################
def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find("corpora/knbc/corpus1")
    fileids = [
        f
        for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader(
        "knbc/corpus1",
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding="euc-jp",
    )

    print(knbc.fileids()[:10])
    print("".join(knbc.words()[:100]))

    print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))

    knbc.morphs2str = lambda morphs: "/".join(
        "{}({})".format(m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    )

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))

    print(
        "\n".join(
            " ".join(
                "{}/{}".format(w[0], w[1].split(" ")[2]) for w in sent
            )
            for sent in knbc.tagged_sents()[0:2]
        )
    )
def test():

    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader(
        "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
    )
    assert isinstance(knbc.words()[0], str)
    assert isinstance(knbc.sents()[0][0], str)
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)
if __name__ == "__main__":
    demo()