Source code for nltk.corpus.reader.wordlist

# Natural Language Toolkit: Word List Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tokenize import line_tokenize


[docs]class WordListCorpusReader(CorpusReader): """ List of words, one per line. Blank lines are ignored. """
[docs] def words(self, fileids=None, ignore_lines_startswith="\n"): return [ line for line in line_tokenize(self.raw(fileids)) if not line.startswith(ignore_lines_startswith) ]
[docs]class SwadeshCorpusReader(WordListCorpusReader):
[docs] def entries(self, fileids=None): """ :return: a tuple of words for the specified fileids. """ if not fileids: fileids = self.fileids() wordlists = [self.words(f) for f in fileids] return list(zip(*wordlists))
[docs]class NonbreakingPrefixesCorpusReader(WordListCorpusReader): """ This is a class to read the nonbreaking prefixes textfiles from the Moses Machine Translation toolkit. These lists are used in the Python port of the Moses' word tokenizer. """ available_langs = { "catalan": "ca", "czech": "cs", "german": "de", "greek": "el", "english": "en", "spanish": "es", "finnish": "fi", "french": "fr", "hungarian": "hu", "icelandic": "is", "italian": "it", "latvian": "lv", "dutch": "nl", "polish": "pl", "portuguese": "pt", "romanian": "ro", "russian": "ru", "slovak": "sk", "slovenian": "sl", "swedish": "sv", "tamil": "ta", } # Also, add the lang IDs as the keys. available_langs.update({v: v for v in available_langs.values()})
[docs] def words(self, lang=None, fileids=None, ignore_lines_startswith="#"): """ This module returns a list of nonbreaking prefixes for the specified language(s). >>> from nltk.corpus import nonbreaking_prefixes as nbp >>> nbp.words('en')[:10] == [u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J'] True >>> nbp.words('ta')[:5] == [u'\u0b85', u'\u0b86', u'\u0b87', u'\u0b88', u'\u0b89'] True :return: a list words for the specified language(s). """ # If *lang* in list of languages available, allocate apt fileid. # Otherwise, the function returns non-breaking prefixes for # all languages when fileids==None. if lang in self.available_langs: lang = self.available_langs[lang] fileids = ["nonbreaking_prefix." + lang] return [ line for line in line_tokenize(self.raw(fileids)) if not line.startswith(ignore_lines_startswith) ]
[docs]class UnicharsCorpusReader(WordListCorpusReader): """ This class is used to read lists of characters from the Perl Unicode Properties (see https://perldoc.perl.org/perluniprops.html). The files in the perluniprop.zip are extracted using the Unicode::Tussle module from https://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm """ # These are categories similar to the Perl Unicode Properties available_categories = [ "Close_Punctuation", "Currency_Symbol", "IsAlnum", "IsAlpha", "IsLower", "IsN", "IsSc", "IsSo", "IsUpper", "Line_Separator", "Number", "Open_Punctuation", "Punctuation", "Separator", "Symbol", ]
[docs] def chars(self, category=None, fileids=None): """ This module returns a list of characters from the Perl Unicode Properties. They are very useful when porting Perl tokenizers to Python. >>> from nltk.corpus import perluniprops as pup >>> pup.chars('Open_Punctuation')[:5] == [u'(', u'[', u'{', u'\u0f3a', u'\u0f3c'] True >>> pup.chars('Currency_Symbol')[:5] == [u'$', u'\xa2', u'\xa3', u'\xa4', u'\xa5'] True >>> pup.available_categories ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc', 'IsSo', 'IsUpper', 'Line_Separator', 'Number', 'Open_Punctuation', 'Punctuation', 'Separator', 'Symbol'] :return: a list of characters given the specific unicode character category """ if category in self.available_categories: fileids = [category + ".txt"] return list(self.raw(fileids).strip())
[docs]class MWAPPDBCorpusReader(WordListCorpusReader): """ This class is used to read the list of word pairs from the subset of lexical pairs of The Paraphrase Database (PPDB) XXXL used in the Monolingual Word Alignment (MWA) algorithm described in Sultan et al. (2014a, 2014b, 2015): - http://acl2014.org/acl2014/Q14/pdf/Q14-1017 - https://www.aclweb.org/anthology/S14-2039 - https://www.aclweb.org/anthology/S15-2027 The original source of the full PPDB corpus can be found on https://www.cis.upenn.edu/~ccb/ppdb/ :return: a list of tuples of similar lexical terms. """ mwa_ppdb_xxxl_file = "ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs"
[docs] def entries(self, fileids=mwa_ppdb_xxxl_file): """ :return: a tuple of synonym word pairs. """ return [tuple(line.split("\t")) for line in line_tokenize(self.raw(fileids))]