import unittest
import pytest
from nltk.corpus import ( # mwa_ppdb
cess_cat,
cess_esp,
conll2007,
floresta,
indian,
ptb,
sinica_treebank,
udhr,
)
from nltk.tree import Tree
class TestUdhr(unittest.TestCase):
    """Checks on the ``udhr`` corpus reader imported from ``nltk.corpus``."""

    def test_words(self):
        """Every fileid of the corpus yields a non-empty word list."""
        for name in udhr.fileids():
            words = list(udhr.words(name))
            # Pass the fileid as the message so a failure names the file.
            self.assertTrue(words, name)

    def test_raw_unicode(self):
        """``raw()`` does not return ``bytes`` for any fileid."""
        for name in udhr.fileids():
            txt = udhr.raw(name)
            # unittest assertion instead of a bare ``assert``, which is
            # stripped when Python runs with -O.
            self.assertNotIsInstance(txt, bytes, name)

    def test_polish_encoding(self):
        """Both Polish Latin2 files start with the same 164 characters."""
        expected = (
            "POWSZECHNA DEKLARACJA PRAW CZŁOWIEKA\n"
            "[Preamble]\n"
            "Trzecia Sesja Ogólnego Zgromadzenia ONZ, obradująca w Paryżu, "
            "uchwaliła 10 grudnia 1948 roku jednomyślnie Powszechną"
        )
        for fileid in ("Polish-Latin2", "Polish_Polski-Latin2"):
            # The fileid doubles as the failure message, as in the
            # other tests of this class.
            self.assertEqual(udhr.raw(fileid)[:164], expected, fileid)
class TestIndian(unittest.TestCase):
    """Spot checks on the first tokens of the ``indian`` corpus."""

    def test_words(self):
        """The first three words match the expected tokens."""
        expected = ["মহিষের", "সন্তান", ":"]
        first_three = indian.words()[:3]
        self.assertEqual(first_three, expected)

    def test_tagged_words(self):
        """The first three (word, tag) pairs match the expected ones."""
        expected = [("মহিষের", "NN"), ("সন্তান", "NN"), (":", "SYM")]
        first_three = indian.tagged_words()[:3]
        self.assertEqual(first_three, expected)
class TestCess(unittest.TestCase):
    """Spot checks on the ``cess_cat`` and ``cess_esp`` corpora."""

    def test_catalan(self):
        """Check the first 15 words and one accented token of ``cess_cat``."""
        expected_prefix = (
            "El Tribunal_Suprem -Fpa- TS -Fpt- ha confirmat la condemna "
            "a quatre anys d' inhabilitació especial"
        ).split()
        self.assertEqual(cess_cat.words()[:15], expected_prefix)

        # Token 34 of the first tagged sentence; element 0 is the word.
        tagged_token = cess_cat.tagged_sents()[0][34]
        self.assertEqual(tagged_token[0], "càrrecs")

    def test_esp(self):
        """Check the first 15 words and one accented word of ``cess_esp``."""
        expected_prefix = (
            "El grupo estatal Electricité_de_France -Fpa- EDF -Fpt- "
            "anunció hoy , jueves , la compra del"
        ).split()
        self.assertEqual(cess_esp.words()[:15], expected_prefix)

        word_115 = cess_esp.words()[115]
        self.assertEqual(word_115, "años")
class TestFloresta(unittest.TestCase):
    """Spot check on the ``floresta`` corpus."""

    def test_words(self):
        """The first ten words match the expected whitespace-split text."""
        expected = "Um revivalismo refrescante O 7_e_Meio é um ex-libris de a".split()
        first_ten = floresta.words()[:10]
        self.assertEqual(first_ten, expected)
class TestSinicaTreebank(unittest.TestCase):
    """Spot checks on the ``sinica_treebank`` corpus."""

    def test_sents(self):
        """The first three sentences match the expected token lists."""
        expected = [
            ["一"],
            ["友情"],
            ["嘉珍", "和", "我", "住在", "同一條", "巷子"],
        ]
        self.assertEqual(sinica_treebank.sents()[:3], expected)

    def test_parsed_sents(self):
        """The parsed sentence at index 25 equals a hand-built ``Tree``."""
        actual = sinica_treebank.parsed_sents()[25]

        # Build the three children separately, then attach them to the root.
        noun_phrase = Tree("NP", [Tree("Nba", ["嘉珍"])])
        adverbial = Tree("V‧地", [Tree("VA11", ["不停"]), Tree("DE", ["的"])])
        verb = Tree("VA4", ["哭泣"])
        expected = Tree("S", [noun_phrase, adverbial, verb])

        self.assertEqual(actual, expected)
class TestCoNLL2007(unittest.TestCase):
    """Checks on the ``esp.train`` file of the ``conll2007`` corpus."""

    # Reading the CoNLL 2007 Dependency Treebanks

    def test_sents(self):
        """The first sentence begins with the expected six tokens."""
        first_sentence = conll2007.sents("esp.train")[0]
        expected_start = ["El", "aumento", "del", "índice", "de", "desempleo"]
        self.assertEqual(first_sentence[:6], expected_start)

    def test_parsed_sents(self):
        """``tree()`` of the first parsed sentence equals a hand-built ``Tree``."""
        actual = conll2007.parsed_sents("esp.train")[0].tree()

        # The expected tree is assembled from named subtrees rather than
        # one deeply nested literal; each variable is named after the
        # label of the subtree's root node.
        aumento_subtree = Tree(
            "aumento",
            [
                "El",
                Tree(
                    "del",
                    [
                        Tree(
                            "índice",
                            [Tree("de", [Tree("desempleo", ["estadounidense"])])],
                        )
                    ],
                ),
            ],
        )
        en_subtree = Tree(
            "en",
            [
                Tree(
                    "mercado",
                    ["el", Tree("de", ["divisas"]), Tree("de", ["Fráncfort"])],
                )
            ],
        )
        frente_a_subtree = Tree(
            "frente_a",
            [
                ",",
                Tree(
                    "0,9349_dólares",
                    ["los", Tree("de", [Tree("mañana", ["esta"])])],
                ),
            ],
        )
        cotizaba_subtree = Tree(
            "cotizaba",
            [
                ",",
                "que",
                Tree("a", [Tree("15.35", ["las", "GMT"])]),
                "se",
                en_subtree,
                Tree("a", ["0,9452_dólares"]),
                frente_a_subtree,
            ],
        )
        al_subtree = Tree("al", [Tree("euro", [cotizaba_subtree])])

        expected = Tree(
            "fortaleció",
            [aumento_subtree, "hoy", "considerablemente", al_subtree, "."],
        )
        self.assertEqual(actual, expected)
@pytest.mark.skipif(
    not ptb.fileids(),
    reason="A full installation of the Penn Treebank is not available",
)
class TestPTB(unittest.TestCase):
    """Checks on the ``ptb`` corpus; skipped when ``ptb.fileids()`` is empty."""

    def test_fileids(self):
        """The first four fileids are the expected BROWN/CF files."""
        expected = [f"BROWN/CF/CF0{number}.MRG" for number in range(1, 5)]
        self.assertEqual(ptb.fileids()[:4], expected)

    def test_words(self):
        """The first seven words of WSJ_0003 match the expected tokens."""
        expected = ["A", "form", "of", "asbestos", "once", "used", "*"]
        actual = ptb.words("WSJ/00/WSJ_0003.MRG")[:7]
        self.assertEqual(actual, expected)

    def test_tagged_words(self):
        """The first three (word, tag) pairs of WSJ_0003 match."""
        expected = [("A", "DT"), ("form", "NN"), ("of", "IN")]
        actual = ptb.tagged_words("WSJ/00/WSJ_0003.MRG")[:3]
        self.assertEqual(actual, expected)

    def test_categories(self):
        """``categories()`` returns exactly the expected nine names, in order."""
        expected = [
            "adventure",
            "belles_lettres",
            "fiction",
            "humor",
            "lore",
            "mystery",
            "news",
            "romance",
            "science_fiction",
        ]
        self.assertEqual(ptb.categories(), expected)

    def test_news_fileids(self):
        """The first three fileids in the "news" category are WSJ files."""
        expected = [
            "WSJ/00/WSJ_0001.MRG",
            "WSJ/00/WSJ_0002.MRG",
            "WSJ/00/WSJ_0003.MRG",
        ]
        self.assertEqual(ptb.fileids("news")[:3], expected)

    def test_category_words(self):
        """Words selected by two categories start with the expected tokens."""
        expected = ["Thirty-three", "Scotty", "did", "not", "go", "back"]
        actual = ptb.words(categories=["humor", "fiction"])[:6]
        self.assertEqual(actual, expected)
@pytest.mark.skip("Skipping test for mwa_ppdb.")
class TestMWAPPDB(unittest.TestCase):
    """Checks on the ``mwa_ppdb`` corpus (currently always skipped).

    NOTE(review): ``mwa_ppdb`` appears only in a comment on this module's
    ``nltk.corpus`` import line, so these tests would raise ``NameError``
    if the skip marker were removed without restoring that import.
    """

    def test_fileids(self):
        """The corpus exposes exactly one fileid."""
        expected = ["ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs"]
        self.assertEqual(mwa_ppdb.fileids(), expected)

    def test_entries(self):
        """The first ten entries match the expected string pairs."""
        expected = [
            ("10/17/01", "17/10/2001"),
            ("102,70", "102.70"),
            ("13,53", "13.53"),
            ("3.2.5.3.2.1", "3.2.5.3.2.1."),
            ("53,76", "53.76"),
            ("6.9.5", "6.9.5."),
            ("7.7.6.3", "7.7.6.3."),
            ("76,20", "76.20"),
            ("79,85", "79.85"),
            ("93,65", "93.65"),
        ]
        self.assertEqual(mwa_ppdb.entries()[:10], expected)