# Source code for nltk.test.unit.test_seekable_unicode_stream_reader

import os
from io import BytesIO

import pytest

from nltk.corpus.reader import SeekableUnicodeStreamReader


def check_reader(unicode_string, encoding):
    """Round-trip ``unicode_string`` through a ``SeekableUnicodeStreamReader``.

    The string is encoded with ``encoding``, wrapped in an in-memory byte
    stream, and read back twice: once via ``readlines()`` and once a single
    character at a time via ``read(1)``.  The position reported by ``tell()``
    is checked before reading and after the first full read.

    :param unicode_string: text to encode and read back.
    :param encoding: codec name used both to encode the text and to
        construct the reader.
    :raises AssertionError: if a position or content check fails.
    :raises UnicodeEncodeError: if ``unicode_string`` cannot be encoded
        with ``encoding``.
    """
    stream = BytesIO(unicode_string.encode(encoding))
    reader = SeekableUnicodeStreamReader(stream, encoding)

    # A freshly created reader should sit at the start of the file.
    assert reader.tell() == 0

    # Joining the lines from `.readlines()` must reproduce the original text.
    assert unicode_string == "".join(reader.readlines())

    # Move the raw stream to its end so `stream.tell()` is the total byte
    # length; the reader should report that same offset.
    stream.seek(0, os.SEEK_END)
    assert reader.tell() == stream.tell()

    reader.seek(0)  # go back to start

    # Re-read one character per call; `iter` stops at the first empty
    # string returned by `.read(1)`.
    contents = "".join(iter(lambda: reader.read(1), ""))
    assert unicode_string == contents


# Inputs for `test_reader`: each string in STRINGS is passed to `check_reader`
# once per codec in ENCODINGS; string/codec pairs that cannot be encoded are
# skipped by the test.  "greek" and "hebrew" are Python codec aliases
# (ISO 8859-7 and ISO 8859-8 respectively).
ENCODINGS = ["ascii", "latin1", "greek", "hebrew", "utf-16", "utf-8"]

STRINGS = [
    # Plain ASCII text; the literal begins with a newline.
    """
    This is a test file.
    It is fairly short.
    """,
    # Single line ending in one non-ASCII character (U+0083).
    "This file can be encoded with latin1. \x83",
    # Contains an empty line plus characters beyond Latin-1.  The `\` right
    # after the opening quotes keeps the literal from starting with a newline.
    """\
    This is a test file.
    Here's a blank line:

    And here's some unicode: \xee \u0123 \uffe3
    """,
    # Mixes a Latin-1 character with several higher code points.
    """\
    This is a test file.
    Unicode characters: \xf3 \u2222 \u3333\u4444 \u5555
    """,
    # Longer input: the trailing `\` on the first two lines joins them into
    # one long line, and the whole literal is repeated twenty times.
    """\
    This is a larger file.  It has some lines that are longer \
    than 72 characters.  It's got lots of repetition.  Here's \
    some unicode chars: \xee \u0123 \uffe3 \ueeee \u2345

    How fun!  Let's repeat it twenty times.
    """
    * 20,
]


@pytest.mark.parametrize("string", STRINGS)
def test_reader(string):
    """Run ``check_reader`` on ``string`` for every codec that can encode it.

    :param string: one entry of ``STRINGS``, supplied by the parametrization.
    """
    for encoding in ENCODINGS:
        try:
            string.encode(encoding)
        except UnicodeEncodeError:
            # This codec cannot represent the string; nothing to check.
            continue
        else:
            check_reader(string, encoding)


def test_reader_stream_closes_when_deleted():
    """Check that the reader's ``__del__`` closes its underlying stream."""
    reader = SeekableUnicodeStreamReader(BytesIO(b""), "ascii")
    assert not reader.stream.closed

    # Invoke the finalizer explicitly so the assertion below does not depend
    # on when the object would actually be garbage-collected.
    reader.__del__()
    assert reader.stream.closed


def teardown_module(module=None):
    """Run a full garbage-collection pass.

    The name and signature follow pytest's module-level teardown hook
    convention.  Presumably the collection is meant to finalize readers
    left over from the tests -- confirm against the reader's ``__del__``.

    :param module: accepted for the hook signature; not used.
    """
    import gc

    gc.collect()
# NLTK

# Documentation

# Source code for nltk.test.unit.test_seekable_unicode_stream_reader