# Source code for nltk.book

# Natural Language Toolkit: Some texts for exploration in chapter 1 of the book
#
# Copyright (C) 2001-2021 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

from nltk.corpus import (
    genesis,
    gutenberg,
    inaugural,
    nps_chat,
    treebank,
    webtext,
    wordnet,
)
from nltk.probability import FreqDist
from nltk.text import Text
from nltk.util import bigrams

# Startup banner: emitted once, every time this module is imported.
# A single print() with a newline separator writes the same four lines.
print(
    "*** Introductory Examples for the NLTK Book ***",
    "Loading text1, ..., text9 and sent1, ..., sent9",
    "Type the name of the text or sentence to view it.",
    "Type: 'texts()' or 'sents()' to list the materials.",
    sep="\n",
)

# Build the nine example texts one at a time. Each text's name is printed
# immediately after that text is constructed, so the output appears
# progressively while the corpora are read.

# text1, text2 and text9 pass no explicit name; the printed name is whatever
# Text assigns by default.
text1 = Text(gutenberg.words("melville-moby_dick.txt"))
print(f"text1: {text1.name}")

text2 = Text(gutenberg.words("austen-sense.txt"))
print(f"text2: {text2.name}")

# The remaining texts (except text9) are given an explicit display name.
text3 = Text(
    genesis.words("english-kjv.txt"),
    name="The Book of Genesis",
)
print(f"text3: {text3.name}")

text4 = Text(
    inaugural.words(),
    name="Inaugural Address Corpus",
)
print(f"text4: {text4.name}")

text5 = Text(
    nps_chat.words(),
    name="Chat Corpus",
)
print(f"text5: {text5.name}")

text6 = Text(
    webtext.words("grail.txt"),
    name="Monty Python and the Holy Grail",
)
print(f"text6: {text6.name}")

text7 = Text(
    treebank.words(),
    name="Wall Street Journal",
)
print(f"text7: {text7.name}")

text8 = Text(
    webtext.words("singles.txt"),
    name="Personals Corpus",
)
print(f"text8: {text8.name}")

text9 = Text(gutenberg.words("chesterton-thursday.txt"))
print(f"text9: {text9.name}")


def texts():
    """Print the label and name of each example text, text1 through text9.

    Reads the module-level ``text1`` ... ``text9`` objects and writes one
    ``textN: <name>`` line per text to standard output. Returns ``None``.
    """
    print("text1:", text1.name)
    print("text2:", text2.name)
    print("text3:", text3.name)
    print("text4:", text4.name)
    print("text5:", text5.name)
    print("text6:", text6.name)
    print("text7:", text7.name)
    print("text8:", text8.name)
    print("text9:", text9.name)


# Example sentences, one per text above, stored as lists of token strings.
# Punctuation marks are separate tokens; sents() joins each list with spaces.
sent1 = ["Call", "me", "Ishmael", "."]
sent2 = [
    "The",
    "family",
    "of",
    "Dashwood",
    "had",
    "long",
    "been",
    "settled",
    "in",
    "Sussex",
    ".",
]
sent3 = [
    "In",
    "the",
    "beginning",
    "God",
    "created",
    "the",
    "heaven",
    "and",
    "the",
    "earth",
    ".",
]
sent4 = [
    "Fellow",
    "-",
    "Citizens",
    "of",
    "the",
    "Senate",
    "and",
    "of",
    "the",
    "House",
    "of",
    "Representatives",
    ":",
]
sent5 = [
    "I",
    "have",
    "a",
    "problem",
    "with",
    "people",
    "PMing",
    "me",
    "to",
    "lol",
    "JOIN",
]
sent6 = [
    "SCENE",
    "1",
    ":",
    "[",
    "wind",
    "]",
    "[",
    "clop",
    "clop",
    "clop",
    "]",
    "KING",
    "ARTHUR",
    ":",
    "Whoa",
    "there",
    "!",
]
sent7 = [
    "Pierre",
    "Vinken",
    ",",
    "61",
    "years",
    "old",
    ",",
    "will",
    "join",
    "the",
    "board",
    "as",
    "a",
    "nonexecutive",
    "director",
    "Nov.",
    "29",
    ".",
]
sent8 = [
    "25",
    "SEXY",
    "MALE",
    ",",
    "seeks",
    "attrac",
    "older",
    "single",
    "lady",
    ",",
    "for",
    "discreet",
    "encounters",
    ".",
]
sent9 = [
    "THE",
    "suburb",
    "of",
    "Saffron",
    "Park",
    "lay",
    "on",
    "the",
    "sunset",
    "side",
    "of",
    "London",
    ",",
    "as",
    "red",
    "and",
    "ragged",
    "as",
    "a",
    "cloud",
    "of",
    "sunset",
    ".",
]


def sents():
    """Print each example sentence, sent1 through sent9, as a single line.

    Reads the module-level ``sent1`` ... ``sent9`` token lists and writes one
    ``sentN: <tokens joined by single spaces>`` line per sentence to standard
    output. Returns ``None``.
    """
    print("sent1:", " ".join(sent1))
    print("sent2:", " ".join(sent2))
    print("sent3:", " ".join(sent3))
    print("sent4:", " ".join(sent4))
    print("sent5:", " ".join(sent5))
    print("sent6:", " ".join(sent6))
    print("sent7:", " ".join(sent7))
    print("sent8:", " ".join(sent8))
    print("sent9:", " ".join(sent9))