Source code for nltk.test.unit.test_corpora

import unittest

import pytest

from nltk.corpus import (  # mwa_ppdb
    cess_cat,
    cess_esp,
    conll2007,
    floresta,
    indian,
    ptb,
    sinica_treebank,
    udhr,
)
from nltk.tree import Tree



[docs]
class TestUdhr(unittest.TestCase):
    """Sanity checks for the ``udhr`` corpus reader.

    All checks use ``unittest`` assertion methods (rather than bare
    ``assert`` statements) so that they still run under ``python -O`` and
    match the assertion style of the other test classes in this module.
    """

    def test_words(self):
        # Every file id must yield at least one word.  The file id is passed
        # as the failure message so that an empty file can be identified.
        for name in udhr.fileids():
            words = list(udhr.words(name))
            self.assertTrue(words, name)

    def test_raw_unicode(self):
        # raw() must never hand back a ``bytes`` object for any file id.
        for name in udhr.fileids():
            txt = udhr.raw(name)
            self.assertNotIsInstance(txt, bytes, name)

    def test_polish_encoding(self):
        # The first 164 characters of both Polish file ids are compared
        # against the same reference text, which contains non-ASCII letters.
        text_pl = udhr.raw("Polish-Latin2")[:164]
        text_ppl = udhr.raw("Polish_Polski-Latin2")[:164]
        expected = """POWSZECHNA DEKLARACJA PRAW CZŁOWIEKA
[Preamble]
Trzecia Sesja Ogólnego Zgromadzenia ONZ, obradująca w Paryżu, \
uchwaliła 10 grudnia 1948 roku jednomyślnie Powszechną"""

        self.assertEqual(text_pl, expected, "Polish-Latin2")
        self.assertEqual(text_ppl, expected, "Polish_Polski-Latin2")





[docs]
class TestIndian(unittest.TestCase):
    """Spot checks on the first three tokens served by the ``indian`` reader."""

    def test_words(self):
        # Plain tokens: the first three words of the corpus.
        expected = ["মহিষের", "সন্তান", ":"]
        self.assertEqual(indian.words()[:3], expected)

    def test_tagged_words(self):
        # Same three tokens, this time as (word, tag) pairs.
        expected = [("মহিষের", "NN"), ("সন্তান", "NN"), (":", "SYM")]
        self.assertEqual(indian.tagged_words()[:3], expected)





[docs]
class TestCess(unittest.TestCase):
    """Spot checks for the ``cess_cat`` and ``cess_esp`` corpus readers."""

    def test_catalan(self):
        # The first 15 tokens must match the whitespace-split reference text.
        txt = "El Tribunal_Suprem -Fpa- TS -Fpt- ha confirmat la condemna a quatre anys d' inhabilitació especial"
        leading_tokens = cess_cat.words()[:15]
        self.assertEqual(leading_tokens, txt.split())

        # Word part of the (word, tag) pair at index 34 of the first tagged
        # sentence; the expected value contains a non-ASCII letter.
        first_tagged_sent = cess_cat.tagged_sents()[0]
        self.assertEqual(first_tagged_sent[34][0], "càrrecs")

    def test_esp(self):
        # The first 15 tokens must match the whitespace-split reference text.
        txt = "El grupo estatal Electricité_de_France -Fpa- EDF -Fpt- anunció hoy , jueves , la compra del"
        leading_tokens = cess_esp.words()[:15]
        self.assertEqual(leading_tokens, txt.split())

        # Token at index 115; the expected value contains a non-ASCII letter.
        self.assertEqual(cess_esp.words()[115], "años")





[docs]
class TestFloresta(unittest.TestCase):
    """Spot check for the ``floresta`` corpus reader."""

    def test_words(self):
        # The first ten tokens must equal the whitespace-split reference text.
        txt = "Um revivalismo refrescante O 7_e_Meio é um ex-libris de a"
        leading_tokens = floresta.words()[:10]
        self.assertEqual(leading_tokens, txt.split())





[docs]
class TestSinicaTreebank(unittest.TestCase):
    """Spot checks for the ``sinica_treebank`` corpus reader."""

    def test_sents(self):
        # The first three sentences, each given as a list of tokens.
        expected = [["一"], ["友情"], ["嘉珍", "和", "我", "住在", "同一條", "巷子"]]
        self.assertEqual(sinica_treebank.sents()[:3], expected)

    def test_parsed_sents(self):
        # A single parse (index 25) is compared against a hand-built Tree.
        tree_at_25 = sinica_treebank.parsed_sents()[25]

        noun_phrase = Tree("NP", [Tree("Nba", ["嘉珍"])])
        manner = Tree("V‧地", [Tree("VA11", ["不停"]), Tree("DE", ["的"])])
        verb = Tree("VA4", ["哭泣"])
        expected = Tree("S", [noun_phrase, manner, verb])

        self.assertEqual(tree_at_25, expected)





[docs]
class TestCoNLL2007(unittest.TestCase):
    """Reading the CoNLL 2007 Dependency Treebanks (``esp.train`` file)."""

    def test_sents(self):
        # First six tokens of the first sentence in "esp.train".
        first_sentence = conll2007.sents("esp.train")[0]
        expected_prefix = ["El", "aumento", "del", "índice", "de", "desempleo"]
        self.assertEqual(first_sentence[:6], expected_prefix)

    def test_parsed_sents(self):
        # The first parse of "esp.train" is converted with .tree() and
        # compared against a Tree assembled below.  The expected tree is
        # built bottom-up from subtrees named after their root label.
        first_parse = conll2007.parsed_sents("esp.train")[0]

        aumento_subtree = Tree(
            "aumento",
            [
                "El",
                Tree(
                    "del",
                    [
                        Tree(
                            "índice",
                            [Tree("de", [Tree("desempleo", ["estadounidense"])])],
                        )
                    ],
                ),
            ],
        )

        en_subtree = Tree(
            "en",
            [
                Tree(
                    "mercado",
                    [
                        "el",
                        Tree("de", ["divisas"]),
                        Tree("de", ["Fráncfort"]),
                    ],
                )
            ],
        )

        frente_a_subtree = Tree(
            "frente_a",
            [
                ",",
                Tree(
                    "0,9349_dólares",
                    [
                        "los",
                        Tree("de", [Tree("mañana", ["esta"])]),
                    ],
                ),
            ],
        )

        cotizaba_subtree = Tree(
            "cotizaba",
            [
                ",",
                "que",
                Tree("a", [Tree("15.35", ["las", "GMT"])]),
                "se",
                en_subtree,
                Tree("a", ["0,9452_dólares"]),
                frente_a_subtree,
            ],
        )

        al_subtree = Tree("al", [Tree("euro", [cotizaba_subtree])])

        expected = Tree(
            "fortaleció",
            [
                aumento_subtree,
                "hoy",
                "considerablemente",
                al_subtree,
                ".",
            ],
        )

        self.assertEqual(first_parse.tree(), expected)





[docs]
@pytest.mark.skipif(
    not ptb.fileids(),
    reason="A full installation of the Penn Treebank is not available",
)
class TestPTB(unittest.TestCase):
    """Checks for the ``ptb`` corpus reader.

    The whole class is skipped when ``ptb.fileids()`` is empty, which is
    evaluated once when this module is imported.
    """

    def test_fileids(self):
        # The first four file ids.
        expected = [
            "BROWN/CF/CF01.MRG",
            "BROWN/CF/CF02.MRG",
            "BROWN/CF/CF03.MRG",
            "BROWN/CF/CF04.MRG",
        ]
        self.assertEqual(ptb.fileids()[:4], expected)

    def test_words(self):
        # First seven tokens of one specific file.
        expected = ["A", "form", "of", "asbestos", "once", "used", "*"]
        self.assertEqual(ptb.words("WSJ/00/WSJ_0003.MRG")[:7], expected)

    def test_tagged_words(self):
        # First three (word, tag) pairs of the same file.
        expected = [("A", "DT"), ("form", "NN"), ("of", "IN")]
        self.assertEqual(ptb.tagged_words("WSJ/00/WSJ_0003.MRG")[:3], expected)

    def test_categories(self):
        # The complete list of categories, in the order the reader returns it.
        expected = [
            "adventure",
            "belles_lettres",
            "fiction",
            "humor",
            "lore",
            "mystery",
            "news",
            "romance",
            "science_fiction",
        ]
        self.assertEqual(ptb.categories(), expected)

    def test_news_fileids(self):
        # First three file ids returned for the "news" category.
        expected = [
            "WSJ/00/WSJ_0001.MRG",
            "WSJ/00/WSJ_0002.MRG",
            "WSJ/00/WSJ_0003.MRG",
        ]
        self.assertEqual(ptb.fileids("news")[:3], expected)

    def test_category_words(self):
        # First six tokens when selecting by a list of categories.
        selected = ["humor", "fiction"]
        expected = ["Thirty-three", "Scotty", "did", "not", "go", "back"]
        self.assertEqual(ptb.words(categories=selected)[:6], expected)





[docs]
@pytest.mark.skip("Skipping test for mwa_ppdb.")
class TestMWAPPDB(unittest.TestCase):
    """Checks for the ``mwa_ppdb`` corpus reader (unconditionally skipped).

    NOTE(review): ``mwa_ppdb`` is not among the names imported from
    ``nltk.corpus`` in this module (it appears only in a comment on that
    import), so these tests would raise ``NameError`` if the skip marker
    were removed without also importing it.
    """

    def test_fileids(self):
        # The reader is expected to expose exactly one file id.
        expected = ["ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs"]
        self.assertEqual(mwa_ppdb.fileids(), expected)

    def test_entries(self):
        # First ten entries, each a pair of strings.
        expected = [
            ("10/17/01", "17/10/2001"),
            ("102,70", "102.70"),
            ("13,53", "13.53"),
            ("3.2.5.3.2.1", "3.2.5.3.2.1."),
            ("53,76", "53.76"),
            ("6.9.5", "6.9.5."),
            ("7.7.6.3", "7.7.6.3."),
            ("76,20", "76.20"),
            ("79,85", "79.85"),
            ("93,65", "93.65"),
        ]
        self.assertEqual(mwa_ppdb.entries()[:10], expected)
NLTK

Documentation

Source code for nltk.test.unit.test_corpora