Source code for nltk.metrics.aline

# Natural Language Toolkit: ALINE
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Greg Kondrak <gkondrak@ualberta.ca>
#         Geoff Bacon <bacon@berkeley.edu> (Python port)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
ALINE
https://webdocs.cs.ualberta.ca/~kondrak/
Copyright 2002 by Grzegorz Kondrak.

ALINE is an algorithm for aligning phonetic sequences, described in [1].
This module is a port of Kondrak's (2002) ALINE. It provides functions for
phonetic sequence alignment and similarity analysis. These are useful in
historical linguistics, sociolinguistics and synchronic phonology.

ALINE has parameters that can be tuned for desired output. These parameters are:
- C_skip, C_sub, C_exp, C_vwl
- Salience weights
- Segmental features

In this implementation, some parameters have been changed from their default
values as described in [1], in order to replicate published results. All changes
are noted in comments.

Example usage
-------------

# Get optimal alignment of two phonetic sequences

>>> align('θin', 'tenwis') # doctest: +SKIP
[[('θ', 't'), ('i', 'e'), ('n', 'n'), ('-', 'w'), ('-', 'i'), ('-', 's')]]

[1] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation,
University of Toronto.
"""

try:
    import numpy as np
except ImportError:
    np = None

# === Constants ===

inf = float("inf")

# Default values for maximum similarity scores (Kondrak 2002: 54)
C_skip = -10  # Indels
C_sub = 35  # Substitutions
C_exp = 45  # Expansions/compressions
C_vwl = 5  # Vowel/consonant relative weight (decreased from 10)

consonants = [
    "B",
    "N",
    "R",
    "b",
    "c",
    "d",
    "f",
    "g",
    "h",
    "j",
    "k",
    "l",
    "m",
    "n",
    "p",
    "q",
    "r",
    "s",
    "t",
    "v",
    "x",
    "z",
    "ç",
    "ð",
    "ħ",
    "ŋ",
    "ɖ",
    "ɟ",
    "ɢ",
    "ɣ",
    "ɦ",
    "ɬ",
    "ɮ",
    "ɰ",
    "ɱ",
    "ɲ",
    "ɳ",
    "ɴ",
    "ɸ",
    "ɹ",
    "ɻ",
    "ɽ",
    "ɾ",
    "ʀ",
    "ʁ",
    "ʂ",
    "ʃ",
    "ʈ",
    "ʋ",
    "ʐ ",
    "ʒ",
    "ʔ",
    "ʕ",
    "ʙ",
    "ʝ",
    "β",
    "θ",
    "χ",
    "ʐ",
    "w",
]

# Relevant features for comparing consonants and vowels
R_c = [
    "aspirated",
    "lateral",
    "manner",
    "nasal",
    "place",
    "retroflex",
    "syllabic",
    "voice",
]
# 'high' taken out of R_v because same as manner
R_v = [
    "back",
    "lateral",
    "long",
    "manner",
    "nasal",
    "place",
    "retroflex",
    "round",
    "syllabic",
    "voice",
]

# Flattened feature matrix (Kondrak 2002: 56)
similarity_matrix = {
    # place
    "bilabial": 1.0,
    "labiodental": 0.95,
    "dental": 0.9,
    "alveolar": 0.85,
    "retroflex": 0.8,
    "palato-alveolar": 0.75,
    "palatal": 0.7,
    "velar": 0.6,
    "uvular": 0.5,
    "pharyngeal": 0.3,
    "glottal": 0.1,
    "labiovelar": 1.0,
    "vowel": -1.0,  # added 'vowel'
    # manner
    "stop": 1.0,
    "affricate": 0.9,
    "fricative": 0.85,  # increased fricative from 0.8
    "trill": 0.7,
    "tap": 0.65,
    "approximant": 0.6,
    "high vowel": 0.4,
    "mid vowel": 0.2,
    "low vowel": 0.0,
    "vowel2": 0.5,  # added vowel
    # high
    "high": 1.0,
    "mid": 0.5,
    "low": 0.0,
    # back
    "front": 1.0,
    "central": 0.5,
    "back": 0.0,
    # binary features
    "plus": 1.0,
    "minus": 0.0,
}

# Relative weights of phonetic features (Kondrak 2002: 55)
salience = {
    "syllabic": 5,
    "place": 40,
    "manner": 50,
    "voice": 5,  # decreased from 10
    "nasal": 20,  # increased from 10
    "retroflex": 10,
    "lateral": 10,
    "aspirated": 5,
    "long": 0,  # decreased from 1
    "high": 3,  # decreased from 5
    "back": 2,  # decreased from 5
    "round": 2,  # decreased from 5
}

# (Kondrak 2002: 59-60)
feature_matrix = {
    # Consonants
    "p": {
        "place": "bilabial",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "minus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "b": {
        "place": "bilabial",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "t": {
        "place": "alveolar",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "minus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "d": {
        "place": "alveolar",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ʈ": {
        "place": "retroflex",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "minus",
        "nasal": "minus",
        "retroflex": "plus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ɖ": {
        "place": "retroflex",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "plus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "c": {
        "place": "palatal",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "minus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ɟ": {
        "place": "palatal",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "k": {
        "place": "velar",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "minus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "g": {
        "place": "velar",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "q": {
        "place": "uvular",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "minus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ɢ": {
        "place": "uvular",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ʔ": {
        "place": "glottal",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "minus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "m": {
        "place": "bilabial",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "plus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ɱ": {
        "place": "labiodental",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "plus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "n": {
        "place": "alveolar",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "plus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ɳ": {
        "place": "retroflex",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "plus",
        "retroflex": "plus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ɲ": {
        "place": "palatal",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "plus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ŋ": {
        "place": "velar",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "plus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ɴ": {
        "place": "uvular",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "plus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "N": {
        "place": "uvular",
        "manner": "stop",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "plus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ʙ": {
        "place": "bilabial",
        "manner": "trill",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "B": {
        "place": "bilabial",
        "manner": "trill",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "r": {
        "place": "alveolar",
        "manner": "trill",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "plus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ʀ": {
        "place": "uvular",
        "manner": "trill",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "R": {
        "place": "uvular",
        "manner": "trill",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ɾ": {
        "place": "alveolar",
        "manner": "tap",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ɽ": {
        "place": "retroflex",
        "manner": "tap",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "plus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ɸ": {
        "place": "bilabial",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "minus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "β": {
        "place": "bilabial",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "f": {
        "place": "labiodental",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "minus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "v": {
        "place": "labiodental",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "θ": {
        "place": "dental",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "minus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ð": {
        "place": "dental",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "s": {
        "place": "alveolar",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "minus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "z": {
        "place": "alveolar",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ʃ": {
        "place": "palato-alveolar",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "minus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ʒ": {
        "place": "palato-alveolar",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ʂ": {
        "place": "retroflex",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "minus",
        "nasal": "minus",
        "retroflex": "plus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ʐ": {
        "place": "retroflex",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "plus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ç": {
        "place": "palatal",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "minus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ʝ": {
        "place": "palatal",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "x": {
        "place": "velar",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "minus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ɣ": {
        "place": "velar",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "χ": {
        "place": "uvular",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "minus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ʁ": {
        "place": "uvular",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ħ": {
        "place": "pharyngeal",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "minus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ʕ": {
        "place": "pharyngeal",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "h": {
        "place": "glottal",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "minus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ɦ": {
        "place": "glottal",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ɬ": {
        "place": "alveolar",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "minus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "plus",
        "aspirated": "minus",
    },
    "ɮ": {
        "place": "alveolar",
        "manner": "fricative",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "plus",
        "aspirated": "minus",
    },
    "ʋ": {
        "place": "labiodental",
        "manner": "approximant",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ɹ": {
        "place": "alveolar",
        "manner": "approximant",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ɻ": {
        "place": "retroflex",
        "manner": "approximant",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "plus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "j": {
        "place": "palatal",
        "manner": "approximant",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "ɰ": {
        "place": "velar",
        "manner": "approximant",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    "l": {
        "place": "alveolar",
        "manner": "approximant",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "plus",
        "aspirated": "minus",
    },
    "w": {
        "place": "labiovelar",
        "manner": "approximant",
        "syllabic": "minus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "aspirated": "minus",
    },
    # Vowels
    "i": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "high",
        "back": "front",
        "round": "minus",
        "long": "minus",
        "aspirated": "minus",
    },
    "y": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "high",
        "back": "front",
        "round": "plus",
        "long": "minus",
        "aspirated": "minus",
    },
    "e": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "mid",
        "back": "front",
        "round": "minus",
        "long": "minus",
        "aspirated": "minus",
    },
    "E": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "mid",
        "back": "front",
        "round": "minus",
        "long": "plus",
        "aspirated": "minus",
    },
    "ø": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "mid",
        "back": "front",
        "round": "plus",
        "long": "minus",
        "aspirated": "minus",
    },
    "ø̞": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "mid",
        "back": "front",
        "round": "plus",
        "long": "minus",
        "aspirated": "minus",
    },
    "ɛ": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "mid",
        "back": "front",
        "round": "minus",
        "long": "minus",
        "aspirated": "minus",
    },
    "œ": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "mid",
        "back": "front",
        "round": "plus",
        "long": "minus",
        "aspirated": "minus",
    },
    "æ": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "low",
        "back": "front",
        "round": "minus",
        "long": "minus",
        "aspirated": "minus",
    },
    "a": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "low",
        "back": "front",
        "round": "minus",
        "long": "minus",
        "aspirated": "minus",
    },
    "ä": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "low",
        "back": "central",
        "round": "minus",
        "long": "minus",
        "aspirated": "minus",
    },
    "ɐ": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "low",
        "back": "central",
        "round": "minus",
        "long": "minus",
        "aspirated": "minus",
    },
    "ɶ": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "low",
        "back": "front",
        "round": "plus",
        "long": "minus",
        "aspirated": "minus",
    },
    "A": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "low",
        "back": "front",
        "round": "minus",
        "long": "plus",
        "aspirated": "minus",
    },
    "ɨ": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "high",
        "back": "central",
        "round": "minus",
        "long": "minus",
        "aspirated": "minus",
    },
    "ʉ": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "high",
        "back": "central",
        "round": "plus",
        "long": "minus",
        "aspirated": "minus",
    },
    "ə": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "mid",
        "back": "central",
        "round": "minus",
        "long": "minus",
        "aspirated": "minus",
    },
    "ɜ": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "mid",
        "back": "central",
        "round": "minus",
        "long": "minus",
        "aspirated": "minus",
    },
    "ɞ": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "mid",
        "back": "central",
        "round": "plus",
        "long": "minus",
        "aspirated": "minus",
    },
    "u": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "high",
        "back": "back",
        "round": "plus",
        "long": "minus",
        "aspirated": "minus",
    },
    "U": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "high",
        "back": "back",
        "round": "plus",
        "long": "plus",
        "aspirated": "minus",
    },
    "o": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "mid",
        "back": "back",
        "round": "plus",
        "long": "minus",
        "aspirated": "minus",
    },
    "o̞": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "mid",
        "back": "back",
        "round": "plus",
        "long": "minus",
        "aspirated": "minus",
    },
    "O": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "mid",
        "back": "back",
        "round": "plus",
        "long": "plus",
        "aspirated": "minus",
    },
    "ɔ": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "mid",
        "back": "back",
        "round": "plus",
        "long": "minus",
        "aspirated": "minus",
    },
    "ʌ": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "mid",
        "back": "back",
        "round": "minus",
        "long": "minus",
        "aspirated": "minus",
    },
    "ɒ": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "low",
        "back": "back",
        "round": "plus",
        "long": "minus",
        "aspirated": "minus",
    },
    "ɑ": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "low",
        "back": "back",
        "round": "minus",
        "long": "minus",
        "aspirated": "minus",
    },
    "I": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "high",
        "back": "front",
        "round": "minus",
        "long": "plus",
        "aspirated": "minus",
    },
    "ɯ": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "high",
        "back": "back",
        "round": "plus",
        "long": "minus",
        "aspirated": "minus",
    },
    "ʏ": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "high",
        "back": "front",
        "round": "plus",
        "long": "minus",
        "aspirated": "minus",
    },
    "ʊ": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "high",
        "back": "back",
        "round": "plus",
        "long": "minus",
        "aspirated": "minus",
    },
    "ɘ": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "high",
        "back": "central",
        "round": "minus",
        "long": "minus",
        "aspirated": "minus",
    },
    "e̞": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "mid",
        "back": "front",
        "round": "minus",
        "long": "minus",
        "aspirated": "minus",
    },
    "ɵ": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "high",
        "back": "central",
        "round": "plus",
        "long": "minus",
        "aspirated": "minus",
    },
    "ɤ": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "mid",
        "back": "back",
        "round": "minus",
        "long": "minus",
        "aspirated": "minus",
    },
    "ɤ̞": {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": "mid",
        "back": "back",
        "round": "minus",
        "long": "minus",
        "aspirated": "minus",
    },
}

# === Algorithm ===


[docs] def align(str1, str2, epsilon=0): """ Compute the alignment of two phonetic strings. :param str str1: First string to be aligned :param str str2: Second string to be aligned :type epsilon: float (0.0 to 1.0) :param epsilon: Adjusts threshold similarity score for near-optimal alignments :rtype: list(list(tuple(str, str))) :return: Alignment(s) of str1 and str2 (Kondrak 2002: 51) """ if np is None: raise ImportError("You need numpy in order to use the align function") assert 0.0 <= epsilon <= 1.0, "Epsilon must be between 0.0 and 1.0." m = len(str1) n = len(str2) # This includes Kondrak's initialization of row 0 and column 0 to all 0s. S = np.zeros((m + 1, n + 1), dtype=float) # If i <= 1 or j <= 1, don't allow expansions as it doesn't make sense, # and breaks array and string indices. Make sure they never get chosen # by setting them to -inf. for i in range(1, m + 1): for j in range(1, n + 1): edit1 = S[i - 1, j] + sigma_skip(str1[i - 1]) edit2 = S[i, j - 1] + sigma_skip(str2[j - 1]) edit3 = S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) if i > 1: edit4 = S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i]) else: edit4 = -inf if j > 1: edit5 = S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j]) else: edit5 = -inf S[i, j] = max(edit1, edit2, edit3, edit4, edit5, 0) T = (1 - epsilon) * np.amax(S) # Threshold score for near-optimal alignments alignments = [] for i in range(1, m + 1): for j in range(1, n + 1): if S[i, j] >= T: alignments.append(_retrieve(i, j, 0, S, T, str1, str2, [])) return alignments
def _retrieve(i, j, s, S, T, str1, str2, out): """ Retrieve the path through the similarity matrix S starting at (i, j). :rtype: list(tuple(str, str)) :return: Alignment of str1 and str2 """ if S[i, j] == 0: return out else: if j > 1 and S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j]) + s >= T: out.insert(0, (str1[i - 1], str2[j - 2 : j])) _retrieve( i - 1, j - 2, s + sigma_exp(str1[i - 1], str2[j - 2 : j]), S, T, str1, str2, out, ) elif ( i > 1 and S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i]) + s >= T ): out.insert(0, (str1[i - 2 : i], str2[j - 1])) _retrieve( i - 2, j - 1, s + sigma_exp(str2[j - 1], str1[i - 2 : i]), S, T, str1, str2, out, ) elif S[i, j - 1] + sigma_skip(str2[j - 1]) + s >= T: out.insert(0, ("-", str2[j - 1])) _retrieve(i, j - 1, s + sigma_skip(str2[j - 1]), S, T, str1, str2, out) elif S[i - 1, j] + sigma_skip(str1[i - 1]) + s >= T: out.insert(0, (str1[i - 1], "-")) _retrieve(i - 1, j, s + sigma_skip(str1[i - 1]), S, T, str1, str2, out) elif S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + s >= T: out.insert(0, (str1[i - 1], str2[j - 1])) _retrieve( i - 1, j - 1, s + sigma_sub(str1[i - 1], str2[j - 1]), S, T, str1, str2, out, ) return out
[docs] def sigma_skip(p): """ Returns score of an indel of P. (Kondrak 2002: 54) """ return C_skip
[docs] def sigma_sub(p, q): """ Returns score of a substitution of P with Q. (Kondrak 2002: 54) """ return C_sub - delta(p, q) - V(p) - V(q)
[docs] def sigma_exp(p, q): """ Returns score of an expansion/compression. (Kondrak 2002: 54) """ q1 = q[0] q2 = q[1] return C_exp - delta(p, q1) - delta(p, q2) - V(p) - max(V(q1), V(q2))
[docs] def delta(p, q): """ Return weighted sum of difference between P and Q. (Kondrak 2002: 54) """ features = R(p, q) total = 0 if np is not None: return np.dot( [diff(p, q, f) for f in features], [salience[f] for f in features] ) for f in features: total += diff(p, q, f) * salience[f] return total
[docs] def diff(p, q, f): """ Returns difference between phonetic segments P and Q for feature F. (Kondrak 2002: 52, 54) """ p_features, q_features = feature_matrix[p], feature_matrix[q] return abs(similarity_matrix[p_features[f]] - similarity_matrix[q_features[f]])
[docs] def R(p, q): """ Return relevant features for segment comparison. (Kondrak 2002: 54) """ if p in consonants or q in consonants: return R_c return R_v
[docs] def V(p): """ Return vowel weight if P is vowel. (Kondrak 2002: 54) """ if p in consonants: return 0 return C_vwl
# === Test ===
[docs] def demo(): """ A demonstration of the result of aligning phonetic sequences used in Kondrak's (2002) dissertation. """ data = [pair.split(",") for pair in cognate_data.split("\n")] for pair in data: alignment = align(pair[0], pair[1])[0] alignment = [f"({a[0]}, {a[1]})" for a in alignment] alignment = " ".join(alignment) print(f"{pair[0]} ~ {pair[1]} : {alignment}")
cognate_data = """jo,ʒə tu,ty nosotros,nu kjen,ki ke,kwa todos,tu una,ən dos,dø tres,trwa ombre,om arbol,arbrə pluma,plym kabeθa,kap boka,buʃ pje,pje koraθon,kœr ber,vwar benir,vənir deθir,dir pobre,povrə ðis,dIzes ðæt,das wat,vas nat,nixt loŋ,laŋ mæn,man fleʃ,flajʃ bləd,blyt feðər,fEdər hær,hAr ir,Or aj,awgə nowz,nAzə mawθ,munt təŋ,tsuŋə fut,fys nij,knI hænd,hant hart,herts livər,lEbər ænd,ante æt,ad blow,flAre ir,awris ijt,edere fiʃ,piʃkis flow,fluere staɾ,stella ful,plenus græs,gramen hart,kordis horn,korny aj,ego nij,genU məðər,mAter mawntən,mons nejm,nomen njuw,nowus wən,unus rawnd,rotundus sow,suere sit,sedere θrij,tres tuwθ,dentis θin,tenwis kinwawa,kenuaʔ nina,nenah napewa,napɛw wapimini,wapemen namesa,namɛʔs okimawa,okemaw ʃiʃipa,seʔsep ahkohkwa,ahkɛh pematesiweni,pematesewen asenja,aʔsɛn""" if __name__ == "__main__": demo()