# Source code for nltk.corpus.reader.sinica_treebank

# Natural Language Toolkit: Sinica Treebank Reader
# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Sinica Treebank Corpus Sample

10,000 parsed sentences, drawn from the Academia Sinica Balanced
Corpus of Modern Chinese.  Parse tree notation is based on
Information-based Case Grammar.  Tagset documentation is available
online.

Language and Knowledge Processing Group, Institute of Information
Science, Academia Sinica

It is distributed with the Natural Language Toolkit under the terms of
the Creative Commons Attribution-NonCommercial-ShareAlike License.

References:

Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999)
The Construction of Sinica Treebank. Computational Linguistics and
Chinese Language Processing, 4, pp 87-104.

Huang Chu-Ren, Keh-Jiann Chen, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming
Gao, and Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria,
Annotation Guidelines, and On-line Interface. Proceedings of 2nd
Chinese Language Processing Workshop, Association for Computational
Linguistics.

Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar
Extraction, Proceedings of IJCNLP-04, pp560-565.
"""

import os
import re

from nltk.tree import sinica_parse
from nltk.tag import map_tag

from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *

# Leading sentence identifier: a '#' at the start of the string, a run of
# non-whitespace characters, and the single whitespace character after it.
IDENTIFIER = re.compile(r'^#\S+\s')
# Trailing appendix: a '#' that directly follows a ')' plus everything after
# it up to the end of the line.
APPENDIX = re.compile(r'(?<=\))#.*$')
# A ':X:Y' pair in which neither field contains ':', '(', ')' or '|'.  Both
# fields are captured; the reader's _tag() treats X as the tag and Y as the
# word.
TAGWORD = re.compile(r':([^:()|]+):([^:()|]+)')
# Same ':X:Y' shape as TAGWORD, but only the second field (the word) is
# captured.
WORD = re.compile(r':[^:()|]+:([^:()|]+)')

class SinicaTreebankCorpusReader(SyntaxCorpusReader):
    """
    Reader for the Sinica Treebank.

    Every block handed to the parsing/tagging hooks is a single line of the
    corpus file with its leading identifier and trailing appendix removed.
    """

    def _read_block(self, stream):
        """
        Read one line from ``stream`` and return it as a one-element list.

        Text matching ``IDENTIFIER`` is removed first, then text matching
        ``APPENDIX``.
        """
        line = stream.readline()
        cleaned = APPENDIX.sub('', IDENTIFIER.sub('', line))
        return [cleaned]

    def _parse(self, sent):
        """Return the result of ``sinica_parse`` applied to ``sent``."""
        return sinica_parse(sent)

    def _tag(self, sent, tagset=None):
        """
        Return the ``(word, tag)`` pairs found in ``sent``.

        ``TAGWORD`` yields ``(tag, word)`` matches, so each pair is swapped.
        When ``tagset`` is given and differs from ``self._tagset``, every tag
        is converted with ``map_tag``.
        """
        pairs = [(word, tag) for (tag, word) in TAGWORD.findall(sent)]
        # No tagset requested, or already the corpus's own tagset: no mapping.
        if not tagset or tagset == self._tagset:
            return pairs
        return [
            (word, map_tag(self._tagset, tagset, tag))
            for (word, tag) in pairs
        ]

    def _word(self, sent):
        """Return the list of words that ``WORD`` captures from ``sent``."""
        return WORD.findall(sent)