Source code for nltk.corpus.reader.wordlist

# Natural Language Toolkit: Word List Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tokenize import line_tokenize



[docs]
class WordListCorpusReader(CorpusReader):
    """
    Corpus reader for word lists stored one word per line.

    Blank lines are ignored.
    """


[docs]
    def words(self, fileids=None, ignore_lines_startswith="\n"):
        """
        Return the lines of the selected file(s) as a list of words.

        :param fileids: handed unchanged to ``self.raw`` to select the text.
        :param ignore_lines_startswith: lines beginning with this prefix
            are left out of the result.
        :return: a list of the remaining lines, in the order produced by
            ``line_tokenize``.
        """
        kept = []
        for entry in line_tokenize(self.raw(fileids)):
            # Skip anything carrying the "ignore" prefix.
            if entry.startswith(ignore_lines_startswith):
                continue
            kept.append(entry)
        return kept





[docs]
class SwadeshCorpusReader(WordListCorpusReader):
    """
    Word list reader whose ``entries`` method lines up the word lists of
    several files position by position.
    """

[docs]
    def entries(self, fileids=None):
        """
        Align the word lists of several files entry by entry.

        :param fileids: a single file identifier or an iterable of file
            identifiers.  When empty or None, every identifier returned by
            ``self.fileids()`` is used.
        :return: a list of tuples; the i-th tuple holds the i-th word of each
            requested file, in the order the file identifiers were given.
            The list is as long as the shortest word list, because ``zip``
            stops at its shortest input.
        """
        if not fileids:
            fileids = self.fileids()
        elif isinstance(fileids, str):
            # A bare string would otherwise be iterated character by
            # character, turning e.g. "en" into the bogus fileids "e", "n".
            fileids = [fileids]

        wordlists = [self.words(f) for f in fileids]
        return list(zip(*wordlists))





[docs]
class NonbreakingPrefixesCorpusReader(WordListCorpusReader):
    """
    Reader for the nonbreaking-prefix text files from the Moses Machine
    Translation toolkit. These lists are used in the Python port of the
    Moses word tokenizer.
    """

    # Maps a language name to its short code.  ``words`` appends the code to
    # "nonbreaking_prefix." to build the file identifier for that language.
    available_langs = {
        "catalan": "ca",
        "czech": "cs",
        "german": "de",
        "greek": "el",
        "english": "en",
        "spanish": "es",
        "finnish": "fi",
        "french": "fr",
        "hungarian": "hu",
        "icelandic": "is",
        "italian": "it",
        "latvian": "lv",
        "dutch": "nl",
        "polish": "pl",
        "portuguese": "pt",
        "romanian": "ro",
        "russian": "ru",
        "slovak": "sk",
        "slovenian": "sl",
        "swedish": "sv",
        "tamil": "ta",
    }
    # Let every short code map to itself as well, so that either the language
    # name or its code can be used as a lookup key.
    available_langs.update({v: v for v in available_langs.values()})


[docs]
    def words(self, lang=None, fileids=None, ignore_lines_startswith="#"):
        """
        Return the list of nonbreaking prefixes for a language.

        >>> from nltk.corpus import nonbreaking_prefixes as nbp
        >>> nbp.words('en')[:10] == [u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J']
        True
        >>> nbp.words('ta')[:5] == [u'\u0b85', u'\u0b86', u'\u0b87', u'\u0b88', u'\u0b89']
        True

        :param lang: a key of ``available_langs`` (language name or code).
            When it is recognised, it determines the file that is read and
            any *fileids* argument is overridden.
        :param fileids: used as given when *lang* is not recognised.
        :param ignore_lines_startswith: lines beginning with this prefix
            are dropped from the result.
        :return: a list of the remaining lines.
        """
        # A recognised language selects its own file.  Otherwise *fileids* is
        # passed through untouched.
        # NOTE(review): with fileids=None this is expected to cover every
        # language, which relies on how ``self.raw(None)`` behaves -- confirm.
        if lang in self.available_langs:
            code = self.available_langs[lang]
            fileids = ["nonbreaking_prefix." + code]

        prefixes = []
        for entry in line_tokenize(self.raw(fileids)):
            if not entry.startswith(ignore_lines_startswith):
                prefixes.append(entry)
        return prefixes





[docs]
class UnicharsCorpusReader(WordListCorpusReader):
    """
    Reader for lists of characters taken from the Perl Unicode
    Properties (see https://perldoc.perl.org/perluniprops.html).
    The files in the perluniprop.zip are extracted using the Unicode::Tussle
    module from https://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm
    """

    # Category names modelled on the Perl Unicode Properties.  ``chars``
    # reads the file named "<category>.txt" for any name listed here.
    available_categories = [
        "Close_Punctuation",
        "Currency_Symbol",
        "IsAlnum",
        "IsAlpha",
        "IsLower",
        "IsN",
        "IsSc",
        "IsSo",
        "IsUpper",
        "Line_Separator",
        "Number",
        "Open_Punctuation",
        "Punctuation",
        "Separator",
        "Symbol",
    ]


[docs]
    def chars(self, category=None, fileids=None):
        """
        Return the characters belonging to a Perl Unicode Property category.
        These lists are very useful when porting Perl tokenizers to Python.

        >>> from nltk.corpus import perluniprops as pup
        >>> pup.chars('Open_Punctuation')[:5] == [u'(', u'[', u'{', u'\u0f3a', u'\u0f3c']
        True
        >>> pup.chars('Currency_Symbol')[:5] == [u'$', u'\xa2', u'\xa3', u'\xa4', u'\xa5']
        True
        >>> pup.available_categories
        ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc', 'IsSo', 'IsUpper', 'Line_Separator', 'Number', 'Open_Punctuation', 'Punctuation', 'Separator', 'Symbol']

        :param category: a name from ``available_categories``; when it is
            recognised, the file "<category>.txt" is read and *fileids* is
            overridden.
        :param fileids: used as given when *category* is not recognised.
        :return: a list with one element per character of the stripped text.
        """
        known = category in self.available_categories
        selected = [category + ".txt"] if known else fileids

        # NOTE(review): strip() removes all leading/trailing whitespace, which
        # may also drop genuine members of whitespace-only categories such as
        # Separator or Line_Separator -- confirm against the data files.
        text = self.raw(selected).strip()
        return list(text)





[docs]
class MWAPPDBCorpusReader(WordListCorpusReader):
    """
    Reader for the list of word pairs from the subset of lexical pairs of
    The Paraphrase Database (PPDB) XXXL used in the Monolingual Word
    Alignment (MWA) algorithm described in Sultan et al. (2014a, 2014b, 2015):

     - http://acl2014.org/acl2014/Q14/pdf/Q14-1017
     - https://www.aclweb.org/anthology/S14-2039
     - https://www.aclweb.org/anthology/S15-2027

    The original source of the full PPDB corpus can be found on
    https://www.cis.upenn.edu/~ccb/ppdb/

    The ``entries`` method returns the pairs as a list of tuples of similar
    lexical terms.
    """

    # File read by ``entries`` when no fileids argument is supplied.
    mwa_ppdb_xxxl_file = "ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs"


[docs]
    def entries(self, fileids=mwa_ppdb_xxxl_file):
        """
        Read the synonym pairs, one tab-separated record per line.

        :param fileids: handed unchanged to ``self.raw``; defaults to the
            class attribute ``mwa_ppdb_xxxl_file``.
        :return: a list of tuples, each holding the tab-separated fields of
            one line.
        """
        rows = line_tokenize(self.raw(fileids))
        return [tuple(row.split("\t")) for row in rows]