# Source code for nltk.classify.textcat

# Natural Language Toolkit: Language ID module using TextCat algorithm
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
A module for language identification using the TextCat algorithm.
An implementation of the text categorization algorithm
presented in Cavnar, W. B. and J. M. Trenkle,
"N-Gram-Based Text Categorization".

The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile both the known languages and
the text yet to be identified, then compares the profiles
using a distance measure.

Language n-grams are provided by the "An Crubadan"
project. A corpus reader was created separately to read
those files.

For details regarding the algorithm, see:
https://www.let.rug.nl/~vannoord/TextCat/textcat.pdf

For details about An Crubadan, see:
https://borel.slu.edu/crubadan/index.html
"""

from sys import maxsize

from nltk.util import trigrams

# Note: this is NOT "re" you're likely used to. The regex module
# is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax.
# You may have to "pip install regex"
try:
    import regex as re
except ImportError:
    re = None
######################################################################
##  Language identification using TextCat
######################################################################


class TextCat:
    """Identify the language of a text by comparing trigram rank profiles.

    A profile of the input text (trigram -> count) is compared against the
    per-language trigram frequencies exposed by the Crubadan corpus reader,
    using the "out-of-place" rank distance.  The language with the smallest
    total distance is the guess.
    """

    # Corpus reader providing the language trigram frequencies; bound in __init__.
    _corpus = None
    # Class-level dict kept for backward compatibility; no method of this
    # class reads or writes it.
    fingerprints = {}
    # Markers wrapped around every token before trigrams are extracted.
    _START_CHAR = "<"
    _END_CHAR = ">"

    # Distances computed by the most recent guess_language() call
    # (rebound on the instance by that method).
    last_distances = {}

    def __init__(self):
        """Bind the Crubadan corpus and warm its per-language frequency cache.

        :raises OSError: if the third-party ``regex`` module is unavailable.
        """
        if not re:
            raise OSError(
                "classify.textcat requires the regex module that "
                "supports unicode. Try '$ pip install regex' and "
                "see https://pypi.python.org/pypi/regex for "
                "further details."
            )

        from nltk.corpus import crubadan

        self._corpus = crubadan
        # Load all language ngrams into cache
        for lang in self._corpus.langs():
            self._corpus.lang_freq(lang)

    def remove_punctuation(self, text):
        """Get rid of punctuation except apostrophes"""
        return re.sub(r"[^\P{P}\']+", "", text)

    def profile(self, text):
        """Create FreqDist of trigrams within text

        Punctuation is stripped, the text is tokenized, and each token is
        wrapped in the start/end markers before its character trigrams are
        counted.
        """
        from nltk import FreqDist, word_tokenize

        clean_text = self.remove_punctuation(text)
        fingerprint = FreqDist()
        for token in word_tokenize(clean_text):
            padded = self._START_CHAR + token + self._END_CHAR
            for tri in trigrams(padded):
                fingerprint["".join(tri)] += 1
        return fingerprint

    @staticmethod
    def _rank_table(freq_dist):
        """Map each sample to its 0-based rank by descending count.

        The sort is stable, so samples with equal counts keep the order in
        which the mapping yields them.
        """
        ordered = sorted(freq_dist.items(), key=lambda item: item[1], reverse=True)
        return {sample: rank for rank, (sample, _count) in enumerate(ordered)}

    @staticmethod
    def _out_of_place(trigram, lang_ranks, text_ranks):
        """Return the rank distance for one trigram given precomputed ranks."""
        lang_rank = lang_ranks.get(trigram)
        if lang_rank is None:
            # Arbitrary but should be larger than any possible trigram
            # file length in terms of total lines
            return maxsize
        return abs(lang_rank - text_ranks[trigram])

    def calc_dist(self, lang, trigram, text_profile):
        """Calculate the "out-of-place" measure between the
        text and language profile for a single trigram

        Ranks are positions in each profile ordered by descending frequency,
        not by the order in which trigrams happened to be inserted.

        :param lang: language code understood by the corpus reader
        :param trigram: the trigram to score
        :param text_profile: mapping of trigram -> count for the input text
        :return: absolute rank difference, or ``sys.maxsize`` when the
            language profile does not contain the trigram
        :raises ValueError: if the trigram is in the language profile but
            missing from ``text_profile``
        """
        lang_fd = self._corpus.lang_freq(lang)
        if trigram not in lang_fd:
            return maxsize
        if trigram not in text_profile:
            raise ValueError(f"{trigram!r} is not in the text profile")
        return self._out_of_place(
            trigram, self._rank_table(lang_fd), self._rank_table(text_profile)
        )

    def lang_dists(self, text):
        """Calculate the "out-of-place" measure between
        the text and all languages

        :return: dict mapping each language code to the summed distance of
            every trigram in the text profile
        """
        # Rank the text once; rank each language once instead of rebuilding
        # and scanning key lists for every single trigram.
        text_ranks = self._rank_table(self.profile(text))
        distances = {}
        for lang in self._corpus._all_lang_freq.keys():
            lang_ranks = self._rank_table(self._corpus.lang_freq(lang))
            distances[lang] = sum(
                self._out_of_place(tri, lang_ranks, text_ranks)
                for tri in text_ranks
            )
        return distances

    def guess_language(self, text):
        """Find the language with the min distance
        to the text and return its ISO 639-3 code

        The full distance table is kept in ``self.last_distances``.
        """
        self.last_distances = self.lang_dists(text)
        return min(self.last_distances, key=self.last_distances.get)
######################################################################
def demo():
    """Guess the language of several UDHR translations and print the results.

    For each listed UDHR file a sample is assembled from its sentences, the
    first 140 characters are printed, and the TextCat guess is reported.
    """
    from nltk.corpus import udhr

    langs = [
        "Kurdish-UTF8",
        "Abkhaz-UTF8",
        "Farsi_Persian-UTF8",
        "Hindi-UTF8",
        "Hawaiian-UTF8",
        "Russian-UTF8",
        "Vietnamese-UTF8",
        "Serbian_Srpski-UTF8",
        "Esperanto-UTF8",
    ]

    friendly = {
        "kmr": "Northern Kurdish",
        "abk": "Abkhazian",
        "pes": "Iranian Persian",
        "hin": "Hindi",
        "haw": "Hawaiian",
        "rus": "Russian",
        "vie": "Vietnamese",
        "srp": "Serbian",
        "epo": "Esperanto",
    }

    tc = TextCat()

    for cur_lang in langs:
        # Get raw data from UDHR corpus
        raw_sentences = udhr.sents(cur_lang)
        # NOTE(review): the final sentence is left out of the sample, as in
        # the original loop bound -- confirm whether that is intentional.
        rows = len(raw_sentences) - 1

        # Generate a sample text of the language: every token is preceded
        # by a single space.
        sample = "".join(
            " " + token for i in range(rows) for token in raw_sentences[i]
        )

        # Try to detect what it is
        print("Language snippet: " + sample[0:140] + "...")
        guess = tc.guess_language(sample)
        # The classifier may return a code outside the table above; do not
        # crash the demo on a misclassification.
        print(f"Language detection: {guess} ({friendly.get(guess, 'unknown')})")
        print("#" * 140)
# Run the demonstration when this module is executed as a script.
if __name__ == "__main__":
    demo()