# Natural Language Toolkit: Sentiment Analyzer
# Copyright (C) 2001-2024 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A SentimentAnalyzer is a tool to implement and facilitate Sentiment Analysis tasks
using NLTK features and classifiers, especially for teaching and demonstrative
purposes.
"""
import sys
from collections import defaultdict
from nltk.classify.util import accuracy as eval_accuracy
from nltk.classify.util import apply_features
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.metrics import f_measure as eval_f_measure
from nltk.metrics import precision as eval_precision
from nltk.metrics import recall as eval_recall
from nltk.probability import FreqDist
class SentimentAnalyzer:
    """
    A Sentiment Analysis tool based on machine learning approaches.

    The analyzer stores a set of feature extractor functions and a classifier.
    Documents are turned into featuresets by the stored extractors and then
    passed to the classifier for training, classification and evaluation.
    """

    def __init__(self, classifier=None):
        """
        :param classifier: an optional classifier instance to use for
            `classify` and `evaluate`. It is replaced by `train`.
        """
        # Maps each feature extractor function to the list of keyword-argument
        # sets it has to be called with.
        self.feat_extractors = defaultdict(list)
        self.classifier = classifier

    def all_words(self, documents, labeled=None):
        """
        Return all words/tokens from the documents (with duplicates).

        :param documents: a list of documents. Each document is either a
            (words, label) tuple or a plain list of words.
        :param labeled: if `True`, assume that each document is represented by a
            (words, label) tuple: (list(str), str). If `False`, each document is
            considered as being a simple list of strings: list(str). If `None`,
            the format is guessed from the type of the first document.
        :rtype: list(str)
        :return: A list of all words/tokens in `documents`.
        """
        all_words = []
        if labeled is None:
            # An empty `documents` is falsy, so documents[0] is never evaluated
            # on empty input.
            labeled = documents and isinstance(documents[0], tuple)
        if labeled:
            for words, _sentiment in documents:
                all_words.extend(words)
        else:
            for words in documents:
                all_words.extend(words)
        return all_words

    def apply_features(self, documents, labeled=None):
        """
        Apply all feature extractor functions to the documents. This is a wrapper
        around `nltk.classify.util.apply_features`.

        If `labeled=False`, return featuresets as:
            [feature_func(doc) for doc in documents]
        If `labeled=True`, return featuresets as:
            [(feature_func(tok), label) for (tok, label) in toks]

        :param documents: a list of documents. If `labeled=True`, the method
            expects a list of (words, label) tuples.
        :param labeled: forwarded unchanged to
            `nltk.classify.util.apply_features`.
        :rtype: LazyMap
        """
        return apply_features(self.extract_features, documents, labeled)

    def unigram_word_feats(self, words, top_n=None, min_freq=0):
        """
        Return most common top_n word features.

        :param words: a list of words/tokens.
        :param top_n: number of best words/tokens to use, sorted by frequency.
            If `None`, all words are considered.
        :param min_freq: only words occurring strictly more than `min_freq`
            times are returned.
        :rtype: list(str)
        :return: A list of `top_n` words/tokens (with no duplicates) sorted by
            frequency.
        """
        # Stopwords are not removed
        unigram_feats_freqs = FreqDist(words)
        return [
            word
            for word, freq in unigram_feats_freqs.most_common(top_n)
            if freq > min_freq
        ]

    def bigram_collocation_feats(
        self, documents, top_n=None, min_freq=3, assoc_measure=BigramAssocMeasures.pmi
    ):
        """
        Return `top_n` bigram features (using `assoc_measure`).
        Note that this method is based on bigram collocations measures, and not
        on simple bigram frequency.

        :param documents: an iterable of documents, each one being a list (or
            iterable) of tokens.
        :param top_n: number of best words/tokens to use, sorted by association
            measure.
        :param assoc_measure: bigram association measure to use as score function.
        :param min_freq: the minimum number of occurrences of bigrams to take
            into consideration.

        :return: `top_n` ngrams scored by the given association measure.
        """
        finder = BigramCollocationFinder.from_documents(documents)
        # Discard rare bigrams before scoring; without this `min_freq` would be
        # silently ignored.
        finder.apply_freq_filter(min_freq)
        return finder.nbest(assoc_measure, top_n)

    def classify(self, instance):
        """
        Classify a single instance applying the features that have already been
        stored in the SentimentAnalyzer.

        :param instance: a list (or iterable) of tokens.
        :return: the classification result given by applying the classifier.
        """
        instance_feats = self.apply_features([instance], labeled=False)
        return self.classifier.classify(instance_feats[0])

    # NOTE(review): `apply_features` and `classify` rely on
    # `self.extract_features`, and `self.feat_extractors` needs a way to be
    # populated. The two methods below provide both; confirm they match what
    # callers of this class expect.
    def add_feat_extractor(self, function, **kwargs):
        """
        Add a new function to extract features from a document. The function
        is stored together with the keyword arguments it has to be called with.

        The same function can be added several times with different keyword
        arguments; it will then be called once per stored argument set.

        :param function: a callable invoked as `function(document, **kwargs)`
            and returning a dict of features.
        :param kwargs: additional parameters required by the `function` function.
        """
        self.feat_extractors[function].append(kwargs)

    def extract_features(self, document):
        """
        Apply the stored extractor functions (and their parameters) to the
        given document and merge the results.

        :param document: the document that will be passed as argument to the
            feature extractor functions.
        :return: A dictionary of the features produced by all stored extractors.
        :rtype: dict
        """
        all_features = {}
        for extractor, param_sets in self.feat_extractors.items():
            for param_set in param_sets:
                # Merge after every call so that each parameter set contributes.
                all_features.update(extractor(document, **param_set))
        return all_features

    def train(self, trainer, training_set, save_classifier=None, **kwargs):
        """
        Train classifier on the training set, optionally saving the output in the
        file specified by `save_classifier`.
        Additional arguments depend on the specific trainer used. For example,
        a MaxentClassifier can use `max_iter` parameter to specify the number
        of iterations, while a NaiveBayesClassifier cannot.

        :param trainer: `train` method of a classifier.
            E.g.: NaiveBayesClassifier.train
        :param training_set: the training set to be passed as argument to the
            classifier `train` method.
        :param save_classifier: the filename of the file where the classifier
            will be stored (optional).
        :param kwargs: additional parameters that will be passed as arguments to
            the classifier `train` function.
        :return: A classifier instance trained on the training set.
        :rtype:
        """
        print("Training classifier")
        self.classifier = trainer(training_set, **kwargs)
        if save_classifier:
            self.save_file(self.classifier, save_classifier)

        return self.classifier

    def save_file(self, content, filename):
        """
        Store `content` in `filename`. Can be used to store a SentimentAnalyzer.

        :param content: the object to pickle.
        :param filename: path of the file to write; it is opened in binary
            write mode, so an existing file is overwritten.
        """
        import pickle

        print("Saving", filename, file=sys.stderr)
        with open(filename, "wb") as storage_file:
            # The protocol=2 parameter is for python2 compatibility
            pickle.dump(content, storage_file, protocol=2)

    # NOTE(review): parameter order follows the docstring; the defaults for the
    # metric switches and `verbose` should be confirmed against existing callers.
    def evaluate(
        self,
        test_set,
        classifier=None,
        accuracy=True,
        f_measure=True,
        precision=True,
        recall=True,
        verbose=False,
    ):
        """
        Evaluate and print classifier performance on the test set.

        :param test_set: A list of (tokens, label) tuples to use as gold set.
        :param classifier: a classifier instance (previously trained). If
            `None`, the classifier stored in the SentimentAnalyzer is used.
        :param accuracy: if `True`, evaluate classifier accuracy.
        :param f_measure: if `True`, evaluate classifier f_measure.
        :param precision: if `True`, evaluate classifier precision.
        :param recall: if `True`, evaluate classifier recall.
        :param verbose: if `True`, print the results in alphabetical order of
            the metric names.
        :return: evaluation results.
        :rtype: dict(str): float
        """
        if classifier is None:
            classifier = self.classifier
        print(f"Evaluating {type(classifier).__name__} results...")
        metrics_results = {}
        if accuracy:
            accuracy_score = eval_accuracy(classifier, test_set)
            metrics_results["Accuracy"] = accuracy_score

        # For every label, collect the indices of the test items that carry it
        # in the gold set and the indices the classifier assigned it to.
        gold_results = defaultdict(set)
        test_results = defaultdict(set)
        labels = set()
        for i, (feats, label) in enumerate(test_set):
            labels.add(label)
            gold_results[label].add(i)
            observed = classifier.classify(feats)
            test_results[observed].add(i)

        for label in labels:
            if precision:
                precision_score = eval_precision(
                    gold_results[label], test_results[label]
                )
                metrics_results[f"Precision [{label}]"] = precision_score
            if recall:
                recall_score = eval_recall(gold_results[label], test_results[label])
                metrics_results[f"Recall [{label}]"] = recall_score
            if f_measure:
                f_measure_score = eval_f_measure(
                    gold_results[label], test_results[label]
                )
                metrics_results[f"F-measure [{label}]"] = f_measure_score

        # Print evaluation results (in alphabetical order)
        if verbose:
            for result in sorted(metrics_results):
                print(f"{result}: {metrics_results[result]}")

        return metrics_results