Source code for nltk.corpus.reader.bcp47

# Natural Language Toolkit: BCP-47 language tags
#
# Copyright (C) 2022-2023 NLTK Project
# Author: Eric Kafe <kafe.eric@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import re
from warnings import warn
from xml.etree import ElementTree as et

from nltk.corpus.reader import CorpusReader



[docs]
class BCP47CorpusReader(CorpusReader):
    """
    Parse BCP-47 composite language tags

    Supports all the main subtags, and the 'u-sd' extension:

    >>> from nltk.corpus import bcp47
    >>> bcp47.name('oc-gascon-u-sd-fr64')
    'Occitan (post 1500): Gascon: Pyrénées-Atlantiques'

    Can load a conversion table to Wikidata Q-codes:

    >>> bcp47.load_wiki_q()
    >>> bcp47.wiki_q['en-GI-spanglis']
    'Q79388'

    Attributes set by the constructor:

    - ``db``: the subtag registry as nested dictionaries, indexed by record
      type and then by tag; deprecated records are kept apart under the
      ``"deprecated"`` key
    - ``version``: the ``File-Date`` value of the registry
    - ``langcode``: maps the descriptions of non-deprecated languages to
      their subtag
    - ``subdiv``: maps subdivision codes to their names
    - ``casing`` and ``format``: per-label case-normalising functions and
      compiled regular expressions used to recognise subtags

    ``wiki_q`` only exists after ``load_wiki_q()`` has been called.

    """


[docs]
    def __init__(self, root, fileids):
        """
        Load the language subtag registry and the subdivision names.

        :param root: corpus root, handed over unchanged to the parent reader
        :param fileids: corpus file ids, handed over unchanged to the parent
            reader
        """
        super().__init__(root, fileids)
        # data_dict() writes into this mapping while it walks the registry,
        # so it has to exist before the registry is parsed.
        self.langcode = {}

        with self.open("iana/language-subtag-registry.txt") as registry:
            registry_records = registry.read().split("%%\n")
            self.db = self.data_dict(registry_records)

        with self.open("cldr/common-subdivisions-en.xml") as subdivisions:
            document = et.parse(subdivisions)
            entries = document.iterfind(
                "localeDisplayNames/subdivisions/subdivision"
            )
            self.subdiv = self.subdiv_dict(entries)

        # Install the casing functions and the subtag patterns.
        self.morphology()



[docs]
    def load_wiki_q(self):
        """
        Read the Wikidata conversion table and store it in ``self.wiki_q``.

        The table is not read by the constructor; call this method before
        using ``wiki_q``.
        """
        with self.open("cldr/tools-cldr-rdf-external-entityToCode.tsv") as table:
            rows = table.read().strip().split("\n")
        # The first row is left out of the conversion.
        self.wiki_q = self.wiki_dict(rows[1:])



[docs]
    def wiki_dict(self, lines):
        """
        Turn tab-separated Wikidata rows into a BCP-47 dictionary.

        :param lines: iterable of strings with at least two tab-separated
            columns each
        :return: dictionary keyed by the second column; the value is the part
            of the first column that follows its last ``/``
        """
        q_codes = {}
        for line in lines:
            columns = line.strip().split("\t")
            entity, code = columns[0], columns[1]
            q_codes[code] = entity.split("/")[-1]
        return q_codes



[docs]
    def subdiv_dict(self, subdivs):
        """
        Index the CLDR subdivision elements by their ``type`` attribute.

        :param subdivs: iterable of XML elements carrying a ``type`` attribute
        :return: dictionary mapping each ``type`` value to the element text
        """
        names = {}
        for element in subdivs:
            code = element.attrib["type"]
            names[code] = element.text
        return names



[docs]
    def morphology(self):
        self.casing = {
            "language": str.lower,
            "extlang": str.lower,
            "script": str.title,
            "region": str.upper,
            "variant": str.lower,
        }
        dig = "[0-9]"
        low = "[a-z]"
        up = "[A-Z]"
        alnum = "[a-zA-Z0-9]"
        self.format = {
            "language": re.compile(f"{low*3}?"),
            "extlang": re.compile(f"{low*3}"),
            "script": re.compile(f"{up}{low*3}"),
            "region": re.compile(f"({up*2})|({dig*3})"),
            "variant": re.compile(f"{alnum*4}{(alnum+'?')*4}"),
            "singleton": re.compile(f"{low}"),
        }



[docs]
    def data_dict(self, records):
        """Convert the BCP-47 language subtag registry to a dictionary"""
        self.version = records[0].replace("File-Date:", "").strip()
        dic = {}
        dic["deprecated"] = {}
        for label in [
            "language",
            "extlang",
            "script",
            "region",
            "variant",
            "redundant",
            "grandfathered",
        ]:
            dic["deprecated"][label] = {}
        for record in records[1:]:
            fields = [field.split(": ") for field in record.strip().split("\n")]
            typ = fields[0][1]
            tag = fields[1][1]
            if typ not in dic:
                dic[typ] = {}
            subfields = {}
            for field in fields[2:]:
                if len(field) == 2:
                    [key, val] = field
                    if key not in subfields:
                        subfields[key] = [val]
                    else:  # multiple value
                        subfields[key].append(val)
                else:  # multiline field
                    subfields[key][-1] += " " + field[0].strip()
                if (
                    "Deprecated" not in record
                    and typ == "language"
                    and key == "Description"
                ):
                    self.langcode[subfields[key][-1]] = tag
            for key in subfields:
                if len(subfields[key]) == 1:  # single value
                    subfields[key] = subfields[key][0]
            if "Deprecated" in record:
                dic["deprecated"][typ][tag] = subfields
            else:
                dic[typ][tag] = subfields
        return dic



[docs]
    def val2str(self, val):
        """
        Return only first value

        :param val: a registry field value, either a single value or a list
            of values
        :return: the first element when ``val`` is a list (or an instance of
            a list subclass), otherwise ``val`` unchanged
        """
        if isinstance(val, list):
            return val[0]
        return val



[docs]
    def lang2str(self, lg_record):
        """
        Join the names of a parsed tag into one ``": "``-separated string.

        :param lg_record: dictionary of labelled subtag names; it must hold a
            ``"language"`` entry
        :return: the language name followed by the extlang, script, region,
            variant and extension names, in that order, for those present
        """
        optional = ("extlang", "script", "region", "variant", "extension")
        parts = [f"{lg_record['language']}"]
        parts.extend(f"{lg_record[label]}" for label in optional if label in lg_record)
        return ": ".join(parts)



[docs]
    def parse_tag(self, tag):
        """Convert a BCP-47 tag to a dictionary of labelled subtags"""
        subtags = tag.split("-")
        lang = {}
        labels = ["language", "extlang", "script", "region", "variant", "variant"]
        while subtags and labels:
            subtag = subtags.pop(0)
            found = False
            while labels:
                label = labels.pop(0)
                subtag = self.casing[label](subtag)
                if self.format[label].fullmatch(subtag):
                    if subtag in self.db[label]:
                        found = True
                        valstr = self.val2str(self.db[label][subtag]["Description"])
                        if label == "variant" and label in lang:
                            lang[label] += ": " + valstr
                        else:
                            lang[label] = valstr
                        break
                    elif subtag in self.db["deprecated"][label]:
                        found = True
                        note = f"The {subtag!r} {label} code is deprecated"
                        if "Preferred-Value" in self.db["deprecated"][label][subtag]:
                            prefer = self.db["deprecated"][label][subtag][
                                "Preferred-Value"
                            ]
                            note += f"', prefer '{self.val2str(prefer)}'"
                        lang[label] = self.val2str(
                            self.db["deprecated"][label][subtag]["Description"]
                        )
                        warn(note)
                        break
            if not found:
                if subtag == "u" and subtags[0] == "sd":  # CLDR regional subdivisions
                    sd = subtags[1]
                    if sd in self.subdiv:
                        ext = self.subdiv[sd]
                    else:
                        ext = f"<Unknown subdivision: {ext}>"
                else:  # other extension subtags are not supported yet
                    ext = f"{subtag}{''.join(['-'+ext for ext in subtags])}".lower()
                    if not self.format["singleton"].fullmatch(subtag):
                        ext = f"<Invalid extension: {ext}>"
                        warn(ext)
                lang["extension"] = ext
                subtags = []
        return lang



[docs]
    def name(self, tag):
        """
        Convert a BCP-47 tag to a colon-separated string of subtag names

        >>> from nltk.corpus import bcp47
        >>> bcp47.name('ca-Latn-ES-valencia')
        'Catalan: Latin: Spain: Valencian'

        """
        for label in ["redundant", "grandfathered"]:
            val = None
            if tag in self.db[label]:
                val = f"{self.db[label][tag]['Description']}"
                note = f"The {tag!r} code is {label}"
            elif tag in self.db["deprecated"][label]:
                val = f"{self.db['deprecated'][label][tag]['Description']}"
                note = f"The {tag!r} code is {label} and deprecated"
                if "Preferred-Value" in self.db["deprecated"][label][tag]:
                    prefer = self.db["deprecated"][label][tag]["Preferred-Value"]
                    note += f", prefer {self.val2str(prefer)!r}"
            if val:
                warn(note)
                return val
        try:
            return self.lang2str(self.parse_tag(tag))
        except:
            warn(f"Tag {tag!r} was not recognized")
            return None