# Natural Language Toolkit: Python port of the tokenizer.
# Copyright (C) 2001-2015 NLTK Project
# Author: Liling Tan (ported from the tokenizer of the NIST BLEU evaluation script)
# Contributors: Ozan Caglayan, Wiktor Stribizew
# URL: <>
# For license information, see LICENSE.TXT

"""
This is an NLTK port of the tokenizer used in the NIST BLEU evaluation script,
which has also been ported into Python elsewhere (the reference link is missing
from this copy of the file).
"""

import io
import re

from nltk.corpus import perluniprops
from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import xml_unescape

class NISTTokenizer(TokenizerI):
    """
    This NIST tokenizer is sentence-based instead of the original
    paragraph-based tokenization of the NIST BLEU evaluation script; the
    sentence-based tokenization is consistent with the other tokenizers
    available in NLTK.

    >>> from nltk.tokenize.nist import NISTTokenizer
    >>> nist = NISTTokenizer()
    >>> s = "Good muffins cost $3.88 in New York."
    >>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']
    >>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']
    >>> nist.tokenize(s, lowercase=False) == expected_cased
    True
    >>> nist.tokenize(s, lowercase=True) == expected_lower  # Lowercased.
    True

    The international_tokenize() is the preferred function when tokenizing
    non-european text, e.g.

    >>> from nltk.tokenize.nist import NISTTokenizer
    >>> nist = NISTTokenizer()

    # Input strings.
    >>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) is a Chinese e-commerce company...'
    >>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...'
    >>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'

    # Expected tokens.
    >>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'\u963f\u91cc\u5df4\u5df4\u96c6\u56e2\u63a7\u80a1', u'\u6709\u9650\u516c\u53f8', u')']
    >>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'\u02c8\xe6', u'm']
    >>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'\u697d\u5929\u682a\u5f0f\u4f1a\u793e', u'Rakuten', u'Kabushiki', u'-', u'gaisha']

    >>> nist.international_tokenize(albb)[:10] == expected_albb
    True
    >>> nist.international_tokenize(amz)[:10] == expected_amz
    True
    >>> nist.international_tokenize(rkt)[:10] == expected_rkt
    True

    # Doctest for patching issue #1926
    >>> sent = u'this is a foo\u2604sentence.'
    >>> expected_sent = [u'this', u'is', u'a', u'foo', u'\u2604', u'sentence', u'.']
    >>> nist.international_tokenize(sent) == expected_sent
    True
    """

    # Every rule below is a ``(compiled_pattern, replacement)`` pair.

    # Strip "skipped" tags.
    STRIP_SKIP = re.compile("<skipped>"), ""
    # Join lines: the Unicode line separator (U+2028) becomes a plain space.
    STRIP_EOL_HYPHEN = re.compile("\u2028"), " "
    # Tokenize punctuation.
    PUNCT = re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), " \\1 "
    # Tokenize period and comma unless preceded by a digit.
    PERIOD_COMMA_PRECEED = re.compile(r"([^0-9])([\.,])"), "\\1 \\2 "
    # Tokenize period and comma unless followed by a digit.
    PERIOD_COMMA_FOLLOW = re.compile(r"([\.,])([^0-9])"), " \\1 \\2"
    # Tokenize dash when preceded by a digit.
    DASH_PRECEED_DIGIT = re.compile("([0-9])(-)"), "\\1 \\2 "

    # Rules applied, in this order, by tokenize() when western_lang is true.
    LANG_DEPENDENT_REGEXES = [
        PUNCT,
        PERIOD_COMMA_PRECEED,
        PERIOD_COMMA_FOLLOW,
        DASH_PRECEED_DIGIT,
    ]

    # Perluniprops characters used in NIST tokenizer.
    pup_number = str("".join(set(perluniprops.chars("Number"))))  # i.e. \p{N}
    pup_punct = str("".join(set(perluniprops.chars("Punctuation"))))  # i.e. \p{P}
    pup_symbol = str("".join(set(perluniprops.chars("Symbol"))))  # i.e. \p{S}

    # The character lists are interpolated into ``[...]`` classes below, so
    # the characters that are special inside a class (``]``, ``^``, ``\`` and
    # ``-``) must be backslash-escaped first.
    number_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_number)
    punct_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_punct)
    symbol_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_symbol)

    # Note: In the original perl implementation, \p{Z} and \p{Zl} were used to
    # (i) strip trailing and heading spaces  and
    # (ii) de-duplicate spaces.
    # In Python, ' '.join(text.strip().split()) achieves the same, so no
    # Separator / Line_Separator character lists are loaded here.

    # Pads every run of ASCII characters with spaces, which separates ASCII
    # text from any adjacent non-ASCII text.
    NONASCII = re.compile("([\x00-\x7f]+)"), r" \1 "
    # Tokenize any punctuation unless followed AND preceded by a digit.
    # NOTE(review): as written, these two patterns match a Number character
    # directly followed by (PUNCT_1) or preceded by (PUNCT_2) a Punctuation
    # character -- confirm this against the Perl original.
    PUNCT_1 = (
        re.compile(f"([{number_regex}])([{punct_regex}])"),
        "\\1 \\2 ",
    )
    PUNCT_2 = (
        re.compile(f"([{punct_regex}])([{number_regex}])"),
        " \\1 \\2",
    )
    # Tokenize symbols.
    SYMBOLS = re.compile(f"([{symbol_regex}])"), " \\1 "

    # Rules applied, in this order, by international_tokenize().
    INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]

    def lang_independent_sub(self, text):
        """Performs the language independent string substitutions.

        Removes ``<skipped>`` tags, unescapes XML entities and then replaces
        U+2028 line separators with spaces.

        :param text: the string to normalise.
        :return: the normalised string.
        """
        # It's a strange order of regexes.
        # It'll be better to unescape after STRIP_EOL_HYPHEN
        # but let's keep it close to the original NIST implementation.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        return text

    def tokenize(self, text, lowercase=False, western_lang=True, return_str=False):
        """Tokenize ``text`` with the language-dependent (western) rules.

        :param text: the input; it is coerced with ``str()``.
        :param lowercase: lowercase the text before the punctuation rules are
            applied. Only consulted when ``western_lang`` is true.
        :param western_lang: apply ``LANG_DEPENDENT_REGEXES``; when false only
            the language-independent substitutions and whitespace
            normalisation are performed.
        :param return_str: return one space-joined string instead of a list.
        :return: a list of tokens, or a string if ``return_str`` is true.
        """
        text = str(text)
        # Language independent regex.
        text = self.lang_independent_sub(text)
        # Language dependent regex.
        if western_lang:
            # Pad string with whitespace so that the rules which need a
            # neighbouring character also fire at the string boundaries.
            text = " " + text + " "
            if lowercase:
                text = text.lower()
            for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
                text = regexp.sub(substitution, text)
        # Collapse contiguous whitespace; split()/join also removes leading
        # and trailing whitespace, so no separate strip() is needed.
        text = " ".join(text.split())
        return text if return_str else text.split()

    def international_tokenize(
        self, text, lowercase=False, split_non_ascii=True, return_str=False
    ):
        """Tokenize ``text`` with the Unicode-property based rules.

        :param text: the input; it is coerced with ``str()``.
        :param lowercase: lowercase the text before the rules are applied.
        :param split_non_ascii: when true (the default) the ``NONASCII`` rule
            is applied, padding runs of ASCII characters with spaces; when
            false that rule is skipped and only the remaining
            ``INTERNATIONAL_REGEXES`` are applied.
        :param return_str: return one space-joined string instead of a list.
        :return: a list of tokens, or a string if ``return_str`` is true.
        """
        text = str(text)
        # Different from the 'normal' tokenize(), STRIP_EOL_HYPHEN is applied
        # first before unescaping.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)

        if lowercase:
            text = text.lower()

        # Honour split_non_ascii: the NONASCII rule is dropped (by identity,
        # so a reordered or overridden rule list keeps working) when the
        # caller opts out of it.
        rules = [
            rule
            for rule in self.INTERNATIONAL_REGEXES
            if split_non_ascii or rule is not self.NONASCII
        ]
        for regexp, substitution in rules:
            text = regexp.sub(substitution, text)

        # Make sure that there's only one space only between words.
        # Strip leading and trailing spaces.
        text = " ".join(text.strip().split())
        return text if return_str else text.split()