# Source code for nltk.test.unit.test_tokenize

"""
Unit tests for nltk.tokenize.
See also nltk/test/tokenize.doctest
"""
from typing import List, Tuple

import pytest

from nltk.tokenize import (
    LegalitySyllableTokenizer,
    StanfordSegmenter,
    SyllableTokenizer,
    TreebankWordTokenizer,
    TweetTokenizer,
    punkt,
    sent_tokenize,
    word_tokenize,
)


def load_stanford_segmenter():
    try:
        seg = StanfordSegmenter()
        seg.default_config("ar")
        seg.default_config("zh")
        return True
    except LookupError:
        return False


check_stanford_segmenter = pytest.mark.skipif(
    not load_stanford_segmenter(),
    reason="NLTK was unable to find stanford-segmenter.jar.",
)

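# Illustrative sketch, not part of the original test module: direct use of the
# Stanford segmenter, mirroring the API exercised by the tests below. It reuses
# the availability check above and is not collected by pytest (no "test_" prefix).
def _example_stanford_segmenter_usage():
    if not load_stanford_segmenter():
        return None
    seg = StanfordSegmenter()
    seg.default_config("zh")
    # segment() takes a list of tokens and returns a whitespace-joined string.
    return seg.segment("这是斯坦福中文分词器测试".split())

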
class TestTokenize:
    def test_tweet_tokenizer(self):
        """
        Test TweetTokenizer using words with special and accented characters.
        """
        tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
        s9 = "@myke: Let's test these words: resumé España München français"
        tokens = tokenizer.tokenize(s9)
        expected = [
            ":", "Let's", "test", "these", "words", ":",
            "resumé", "España", "München", "français",
        ]
        assert tokens == expected

    @pytest.mark.parametrize(
        "test_input, expecteds",
        [
            (
                "My text 0106404243030 is great text",
                (
                    ["My", "text", "01064042430", "30", "is", "great", "text"],
                    ["My", "text", "0106404243030", "is", "great", "text"],
                ),
            ),
            (
                "My ticket id is 1234543124123",
                (
                    ["My", "ticket", "id", "is", "12345431241", "23"],
                    ["My", "ticket", "id", "is", "1234543124123"],
                ),
            ),
            (
                "@remy: This is waaaaayyyy too much for you!!!!!! 01064042430",
                (
                    [":", "This", "is", "waaayyy", "too", "much", "for", "you",
                     "!", "!", "!", "01064042430"],
                    [":", "This", "is", "waaayyy", "too", "much", "for", "you",
                     "!", "!", "!", "01064042430"],
                ),
            ),
            # Further tests from https://github.com/nltk/nltk/pull/2798#issuecomment-922533085,
            # showing the TweetTokenizer performance for `match_phone_numbers=True`
            # and `match_phone_numbers=False`.
            (
                # Some phone numbers are always tokenized, even with
                # `match_phone_numbers=False`
                "My number is 06-46124080, except it's not.",
                (
                    ["My", "number", "is", "06-46124080", ",",
                     "except", "it's", "not", "."],
                    ["My", "number", "is", "06-46124080", ",",
                     "except", "it's", "not", "."],
                ),
            ),
            (
                # Phone number here is only tokenized correctly if
                # `match_phone_numbers=True`
                "My number is 601-984-4813, except it's not.",
                (
                    ["My", "number", "is", "601-984-4813", ",",
                     "except", "it's", "not", "."],
                    ["My", "number", "is", "601-984-", "4813", ",",
                     "except", "it's", "not", "."],
                ),
            ),
            (
                # Phone number here is only tokenized correctly if
                # `match_phone_numbers=True`
                "My number is (393) 928 -3010, except it's not.",
                (
                    ["My", "number", "is", "(393) 928 -3010", ",",
                     "except", "it's", "not", "."],
                    ["My", "number", "is", "(", "393", ")", "928", "-", "3010",
                     ",", "except", "it's", "not", "."],
                ),
            ),
            (
                # A long number is tokenized correctly only if
                # `match_phone_numbers=False`
                "The product identification number is 48103284512.",
                (
                    ["The", "product", "identification", "number", "is",
                     "4810328451", "2", "."],
                    ["The", "product", "identification", "number", "is",
                     "48103284512", "."],
                ),
            ),
            (
                # `match_phone_numbers=True` can have some unforeseen side effects
                "My favourite substraction is 240 - 1353.",
                (
                    ["My", "favourite", "substraction", "is", "240 - 1353", "."],
                    ["My", "favourite", "substraction", "is", "240", "-", "1353", "."],
                ),
            ),
        ],
    )
    def test_tweet_tokenizer_expanded(
        self, test_input: str, expecteds: Tuple[List[str], List[str]]
    ):
        """
        Test `match_phone_numbers` in TweetTokenizer.

        Note that TweetTokenizer is also passed the following for these tests:
         * strip_handles=True
         * reduce_len=True

        :param test_input: The input string to tokenize using TweetTokenizer.
        :type test_input: str
        :param expecteds: A 2-tuple of tokenized sentences. The first of the two
            tokenized lists is the expected output of tokenization with
            `match_phone_numbers=True`. The second of the two tokenized lists is
            the expected output of tokenization with `match_phone_numbers=False`.
        :type expecteds: Tuple[List[str], List[str]]
        """
        for match_phone_numbers, expected in zip([True, False], expecteds):
            tokenizer = TweetTokenizer(
                strip_handles=True,
                reduce_len=True,
                match_phone_numbers=match_phone_numbers,
            )
            predicted = tokenizer.tokenize(test_input)
            assert predicted == expected

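    # Illustrative sketch, not part of the original test suite: a minimal direct
    # call showing the `match_phone_numbers` flag exercised indirectly by the
    # parametrized test above. The leading underscore keeps pytest from
    # collecting it as a test.
    def _example_match_phone_numbers(self):
        sent = "My number is 601-984-4813, except it's not."
        keep_numbers = TweetTokenizer(match_phone_numbers=True).tokenize(sent)
        split_numbers = TweetTokenizer(match_phone_numbers=False).tokenize(sent)
        # With matching enabled the phone number survives as the single token
        # "601-984-4813"; with it disabled the digit run is broken up.
        return keep_numbers, split_numbers
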
    def test_sonority_sequencing_syllable_tokenizer(self):
        """
        Test SyllableTokenizer tokenizer.
        """
        tokenizer = SyllableTokenizer()
        tokens = tokenizer.tokenize("justification")
        assert tokens == ["jus", "ti", "fi", "ca", "tion"]

    def test_syllable_tokenizer_numbers(self):
        """
        Test SyllableTokenizer tokenizer on a long run of digits.
        """
        tokenizer = SyllableTokenizer()
        text = "9" * 10000
        tokens = tokenizer.tokenize(text)
        assert tokens == [text]

    def test_legality_principle_syllable_tokenizer(self):
        """
        Test LegalitySyllableTokenizer tokenizer.
        """
        from nltk.corpus import words

        test_word = "wonderful"
        tokenizer = LegalitySyllableTokenizer(words.words())
        tokens = tokenizer.tokenize(test_word)
        assert tokens == ["won", "der", "ful"]

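    # Illustrative sketch, not part of the original test suite: the two syllable
    # tokenizers tested above differ in construction. SyllableTokenizer works out
    # of the box, while LegalitySyllableTokenizer is built from a word list
    # (here, the NLTK words corpus, as in the test above).
    def _example_syllable_tokenizers(self):
        from nltk.corpus import words

        sonority = SyllableTokenizer()
        legality = LegalitySyllableTokenizer(words.words())
        # Expected per the tests above: ["jus", "ti", "fi", "ca", "tion"]
        # and ["won", "der", "ful"].
        return sonority.tokenize("justification"), legality.tokenize("wonderful")
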
    @check_stanford_segmenter
    def test_stanford_segmenter_arabic(self):
        """
        Test the Stanford Word Segmenter for Arabic (default config)
        """
        seg = StanfordSegmenter()
        seg.default_config("ar")
        sent = "يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات"
        segmented_sent = seg.segment(sent.split())
        assert segmented_sent.split() == [
            "يبحث", "علم", "الحاسوب", "استخدام", "الحوسبة", "ب",
            "جميع", "اشكال", "ها", "ل", "حل", "المشكلات",
        ]

    @check_stanford_segmenter
    def test_stanford_segmenter_chinese(self):
        """
        Test the Stanford Word Segmenter for Chinese (default config)
        """
        seg = StanfordSegmenter()
        seg.default_config("zh")
        sent = "这是斯坦福中文分词器测试"
        segmented_sent = seg.segment(sent.split())
        assert segmented_sent.split() == ["这", "是", "斯坦福", "中文", "分词器", "测试"]

    def test_phone_tokenizer(self):
        """
        Test a string that resembles a phone number but contains a newline
        """
        tokenizer = TweetTokenizer()

        # Should be recognized as a phone number, albeit one with multiple spaces
        test1 = "(393) 928 -3010"
        expected = ["(393) 928 -3010"]
        result = tokenizer.tokenize(test1)
        assert result == expected

        # Due to newline, first three elements aren't part of a phone number;
        # fourth is
        test2 = "(393)\n928 -3010"
        expected = ["(", "393", ")", "928 -3010"]
        result = tokenizer.tokenize(test2)
        assert result == expected

    def test_emoji_tokenizer(self):
        """
        Test a string that contains Emoji ZWJ Sequences and skin tone modifiers
        """
        tokenizer = TweetTokenizer()

        # An Emoji ZWJ Sequence: the characters combine into a single emoji and should not be split.
        test1 = "👨‍👩‍👧‍👧"
        expected = ["👨‍👩‍👧‍👧"]
        result = tokenizer.tokenize(test1)
        assert result == expected

        # An emoji with a skin tone modifier: the two characters form a single emoji and should not be split.
        test2 = "👨🏿"
        expected = ["👨🏿"]
        result = tokenizer.tokenize(test2)
        assert result == expected

        # A string containing both skin tone modifiers and ZWJ Sequences
        test3 = "🤔 🙈 me así, se😌 ds 💕👭👙 hello 👩🏾‍🎓 emoji hello 👨‍👩‍👦‍👦 how are 😊 you today🙅🏽🙅🏽"
        expected = [
            "🤔", "🙈", "me", "así", ",", "se", "😌", "ds", "💕", "👭", "👙",
            "hello", "👩🏾\u200d🎓", "emoji", "hello", "👨\u200d👩\u200d👦\u200d👦",
            "how", "are", "😊", "you", "today", "🙅🏽", "🙅🏽",
        ]
        result = tokenizer.tokenize(test3)
        assert result == expected

        # Emoji flag sequences, including enclosed letter pairs
        # Expected behavior from #3034
        test4 = "🇦🇵🇵🇱🇪"
        expected = ["🇦🇵", "🇵🇱", "🇪"]
        result = tokenizer.tokenize(test4)
        assert result == expected

        test5 = "Hi 🇨🇦, 😍!!"
        expected = ["Hi", "🇨🇦", ",", "😍", "!", "!"]
        result = tokenizer.tokenize(test5)
        assert result == expected

        test6 = "<3 🇨🇦 🤝 🇵🇱 <3"
        expected = ["<3", "🇨🇦", "🤝", "🇵🇱", "<3"]
        result = tokenizer.tokenize(test6)
        assert result == expected

    def test_pad_asterisk(self):
        """
        Test padding of asterisk for word tokenization.
        """
        text = "This is a, *weird sentence with *asterisks in it."
        expected = [
            "This", "is", "a", ",", "*", "weird", "sentence",
            "with", "*", "asterisks", "in", "it", ".",
        ]
        assert word_tokenize(text) == expected

    def test_pad_dotdot(self):
        """
        Test padding of dotdot* for word tokenization.
        """
        text = "Why did dotdot.. not get tokenized but dotdotdot... did? How about manydots....."
        expected = [
            "Why", "did", "dotdot", "..", "not", "get", "tokenized",
            "but", "dotdotdot", "...", "did", "?", "How", "about",
            "manydots", ".....",
        ]
        assert word_tokenize(text) == expected

    def test_remove_handle(self):
        """
        Test remove_handle() from casual.py with specially crafted edge cases
        """
        tokenizer = TweetTokenizer(strip_handles=True)

        # Simple example. Handles with just numbers should be allowed
        test1 = "@twitter hello @twi_tter_. hi @12345 @123news"
        expected = ["hello", ".", "hi"]
        result = tokenizer.tokenize(test1)
        assert result == expected

        # Handles are allowed to follow any of the following characters
        test2 = "@n`@n~@n(@n)@n-@n=@n+@n\\@n|@n[@n]@n{@n}@n;@n:@n'@n\"@n/@n?@n.@n,@n<@n>@n @n\n@n ñ@n.ü@n.ç@n."
        expected = [
            "`", "~", "(", ")", "-", "=", "+", "\\", "|", "[", "]", "{", "}",
            ";", ":", "'", '"', "/", "?", ".", ",", "<", ">",
            "ñ", ".", "ü", ".", "ç", ".",
        ]
        result = tokenizer.tokenize(test2)
        assert result == expected

        # Handles are NOT allowed to follow any of the following characters
        test3 = "a@n j@n z@n A@n L@n Z@n 1@n 4@n 7@n 9@n 0@n _@n !@n @@n #@n $@n %@n &@n *@n"
        expected = [
            "a", "@n", "j", "@n", "z", "@n", "A", "@n", "L", "@n", "Z", "@n",
            "1", "@n", "4", "@n", "7", "@n", "9", "@n", "0", "@n", "_", "@n",
            "!", "@n", "@", "@n", "#", "@n", "$", "@n", "%", "@n", "&", "@n",
            "*", "@n",
        ]
        result = tokenizer.tokenize(test3)
        assert result == expected

        # Handles are allowed to precede the following characters
        test4 = "@n!a @n#a @n$a @n%a @n&a @n*a"
        expected = ["!", "a", "#", "a", "$", "a", "%", "a", "&", "a", "*", "a"]
        result = tokenizer.tokenize(test4)
        assert result == expected

        # Tests interactions with special symbols and multiple @
        test5 = "@n!@n @n#@n @n$@n @n%@n @n&@n @n*@n @n@n @@n @n@@n @n_@n @n7@n @nj@n"
        expected = [
            "!", "@n", "#", "@n", "$", "@n", "%", "@n", "&", "@n", "*", "@n",
            "@n", "@n", "@", "@n", "@n", "@", "@n", "@n_", "@n",
            "@n7", "@n", "@nj", "@n",
        ]
        result = tokenizer.tokenize(test5)
        assert result == expected

        # Tests that handles can have a max length of 15
        test6 = "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmno1234 @abcdefghijklmno_ @abcdefghijklmnoendofhandle"
        expected = ["pqrstuvwxyz", "1234", "_", "endofhandle"]
        result = tokenizer.tokenize(test6)
        assert result == expected

        # Edge case where an @ comes directly after a long handle
        test7 = "@abcdefghijklmnop@abcde @abcdefghijklmno@abcde @abcdefghijklmno_@abcde @abcdefghijklmno5@abcde"
        expected = [
            "p", "@abcde", "@abcdefghijklmno", "@abcde",
            "_", "@abcde", "5", "@abcde",
        ]
        result = tokenizer.tokenize(test7)
        assert result == expected

    def test_treebank_span_tokenizer(self):
        """
        Test TreebankWordTokenizer.span_tokenize function
        """
        tokenizer = TreebankWordTokenizer()

        # Test case in the docstring
        test1 = "Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks)."
        expected = [
            (0, 4), (5, 12), (13, 17), (18, 19), (19, 23), (24, 26), (27, 30),
            (31, 32), (32, 36), (36, 37), (37, 38), (40, 46), (47, 48),
            (48, 51), (51, 52), (53, 55), (56, 59), (60, 62), (63, 68),
            (69, 70), (70, 76), (76, 77), (77, 78),
        ]
        result = list(tokenizer.span_tokenize(test1))
        assert result == expected

        # Test case with double quotation
        test2 = 'The DUP is similar to the "religious right" in the United States and takes a hardline stance on social issues'
        expected = [
            (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27),
            (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57),
            (58, 64), (65, 68), (69, 74), (75, 76), (77, 85), (86, 92),
            (93, 95), (96, 102), (103, 109),
        ]
        result = list(tokenizer.span_tokenize(test2))
        assert result == expected

        # Test case with double quotation as well as converted quotations
        test3 = "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues"
        expected = [
            (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27),
            (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57),
            (58, 64), (65, 68), (69, 74), (75, 76), (77, 79), (79, 87),
            (87, 89), (90, 96), (97, 99), (100, 106), (107, 113),
        ]
        result = list(tokenizer.span_tokenize(test3))
        assert result == expected

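    # Illustrative sketch, not part of the original test suite: span_tokenize
    # yields (start, end) character offsets into the input, so slicing the
    # original string with each span reproduces the corresponding token.
    def _example_spans_to_tokens(self):
        tokenizer = TreebankWordTokenizer()
        text = "Good muffins cost $3.88\nin New (York)."
        spans = list(tokenizer.span_tokenize(text))
        # e.g. the first span (0, 4) slices out "Good".
        return [text[start:end] for start, end in spans]
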
    def test_word_tokenize(self):
        """
        Test word_tokenize function
        """
        sentence = "The 'v', I've been fooled but I'll seek revenge."
        expected = [
            "The", "'", "v", "'", ",", "I", "'ve", "been", "fooled",
            "but", "I", "'ll", "seek", "revenge", ".",
        ]
        assert word_tokenize(sentence) == expected

        sentence = "'v' 're'"
        expected = ["'", "v", "'", "'re", "'"]
        assert word_tokenize(sentence) == expected

    def test_punkt_pair_iter(self):
        test_cases = [
            ("12", [("1", "2"), ("2", None)]),
            ("123", [("1", "2"), ("2", "3"), ("3", None)]),
            ("1234", [("1", "2"), ("2", "3"), ("3", "4"), ("4", None)]),
        ]

        for (test_input, expected_output) in test_cases:
            actual_output = [x for x in punkt._pair_iter(test_input)]
            assert actual_output == expected_output

    def test_punkt_pair_iter_handles_stop_iteration_exception(self):
        # test input to trigger StopIteration from next()
        it = iter([])
        # call method under test and produce a generator
        gen = punkt._pair_iter(it)
        # unpack generator, ensure that no error is raised
        list(gen)

    def test_punkt_tokenize_words_handles_stop_iteration_exception(self):
        obj = punkt.PunktBaseClass()

        class TestPunktTokenizeWordsMock:
            def word_tokenize(self, s):
                return iter([])

        obj._lang_vars = TestPunktTokenizeWordsMock()
        # unpack generator, ensure that no error is raised
        list(obj._tokenize_words("test"))

    def test_punkt_tokenize_custom_lang_vars(self):
        # Create LangVars including a full stop end character as used in Bengali
        class BengaliLanguageVars(punkt.PunktLanguageVars):
            sent_end_chars = (".", "?", "!", "\u0964")

        obj = punkt.PunktSentenceTokenizer(lang_vars=BengaliLanguageVars())

        # We now expect these sentences to be split up into the individual sentences
        sentences = "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।"
        expected = [
            "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন।",
            "অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন।",
            "এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।",
        ]

        assert obj.tokenize(sentences) == expected

    def test_punkt_tokenize_no_custom_lang_vars(self):
        obj = punkt.PunktSentenceTokenizer()

        # We expect these sentences to not be split properly, as the Bengali full stop
        # '।' is not included in the default language vars
        sentences = "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।"
        expected = [
            "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।"
        ]

        assert obj.tokenize(sentences) == expected

    @pytest.mark.parametrize(
        "input_text,n_sents,n_splits,lang_vars",
        [
            # Test debug_decisions on a text with two sentences, split by a dot.
            ("Subject: Some subject. Attachments: Some attachments", 2, 1),
            # The sentence should be split into two sections,
            # with one split and hence one decision.
            # Test debug_decisions on a text with two sentences, split by an exclamation mark.
            ("Subject: Some subject! Attachments: Some attachments", 2, 1),
            # The sentence should be split into two sections,
            # with one split and hence one decision.
            # Test debug_decisions on a text with one sentence,
            # which is not split.
            ("This is just a normal sentence, just like any other.", 1, 0),
            # Hence just 1 sentence and no splits.
        ],
    )
    def punkt_debug_decisions(self, input_text, n_sents, n_splits, lang_vars=None):
        tokenizer = punkt.PunktSentenceTokenizer()
        if lang_vars is not None:
            tokenizer._lang_vars = lang_vars

        assert len(tokenizer.tokenize(input_text)) == n_sents
        assert len(list(tokenizer.debug_decisions(input_text))) == n_splits

    def test_punkt_debug_decisions_custom_end(self):
        # Test debug_decisions on a text with two sentences,
        # split by a custom end character, based on Issue #2519
        class ExtLangVars(punkt.PunktLanguageVars):
            sent_end_chars = (".", "?", "!", "^")

        self.punkt_debug_decisions(
            "Subject: Some subject^ Attachments: Some attachments",
            n_sents=2,
            n_splits=1,
            lang_vars=ExtLangVars(),
        )
        # The sentence should be split into two sections,
        # with one split and hence one decision.

    @pytest.mark.parametrize(
        "sentences, expected",
        [
            (
                "this is a test. . new sentence.",
                ["this is a test.", ".", "new sentence."],
            ),
            ("This. . . That", ["This.", ".", ".", "That"]),
            ("This..... That", ["This..... That"]),
            ("This... That", ["This... That"]),
            ("This.. . That", ["This.. .", "That"]),
            ("This. .. That", ["This.", ".. That"]),
            ("This. ,. That", ["This.", ",.", "That"]),
            ("This!!! That", ["This!!!", "That"]),
            ("This! That", ["This!", "That"]),
            (
                "1. This is R .\n2. This is A .\n3. That's all",
                ["1.", "This is R .", "2.", "This is A .", "3.", "That's all"],
            ),
            (
                "1. This is R .\t2. This is A .\t3. That's all",
                ["1.", "This is R .", "2.", "This is A .", "3.", "That's all"],
            ),
            ("Hello.\tThere", ["Hello.", "There"]),
        ],
    )
    def test_sent_tokenize(self, sentences: str, expected: List[str]):
        assert sent_tokenize(sentences) == expected
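

# Illustrative sketch, not part of the original test module: the module-level
# helpers exercised above are the usual entry points. sent_tokenize splits text
# into sentences with the pretrained Punkt model (the `language` argument
# defaults to "english"), and word_tokenize splits each sentence into tokens.
def _example_module_level_tokenizers():
    text = "Good muffins cost $3.88 in New York. Please buy me two of them."
    sentences = sent_tokenize(text, language="english")
    return [word_tokenize(sentence) for sentence in sentences]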