# Source code for nltk.metrics.windowdiff
# Natural Language Toolkit: Windowdiff
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
# Steven Bird <sb@csse.unimelb.edu.au>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
##########################################################################
# Windowdiff
# Pevzner, L., and Hearst, M., A Critique and Improvement of
# an Evaluation Metric for Text Segmentation,
# Computational Linguistics, 28 (1), March 2002, pp. 19-36
##########################################################################
from __future__ import print_function
def windowdiff(seg1, seg2, k, boundary="1"):
    """
    Compute the windowdiff score for a pair of segmentations.  A
    segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to mark
    the edge of a segmentation.

    A window covering ``k + 1`` consecutive positions is slid over both
    segmentations in parallel.  For every window position the absolute
    difference between the two boundary counts is added to the score.
    The result is the raw, unnormalized sum: 0 means both segmentations
    contain the same number of boundaries in every window.

        >>> from nltk.metrics.windowdiff import windowdiff
        >>> s1 = "00000010000000001000000"
        >>> s2 = "00000001000000010000000"
        >>> s3 = "00010000000000000001000"
        >>> windowdiff(s1, s1, 3)
        0
        >>> windowdiff(s1, s2, 3)
        4
        >>> windowdiff(s2, s3, 3)
        16

    :param seg1: a segmentation
    :type seg1: str or list
    :param seg2: a segmentation
    :type seg2: str or list
    :param k: window width
    :type k: int
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: int
    :raise ValueError: if the segmentations have unequal length, or if
        ``k`` is negative or not smaller than that length (this includes
        empty segmentations, for which no window fits)
    """
    length = len(seg1)
    if length != len(seg2):
        raise ValueError("Segmentations have unequal length")
    # Each window spans k + 1 positions.  Without this check a negative or
    # oversized k compares no real window and the function would report a
    # score of 0 (perfect agreement) for arbitrarily different inputs.
    if k < 0 or k >= length:
        raise ValueError("Window width k must satisfy 0 <= k < len(seg1)")

    span = k + 1
    return sum(
        abs(seg1[i:i + span].count(boundary) - seg2[i:i + span].count(boundary))
        for i in range(length - k)
    )
def demo():
    """
    Print three sample segmentations, then the windowdiff score (with
    k = 3) for the pairs (s1, s1), (s1, s2) and (s2, s3).
    """
    s1 = "00000010000000001000000"
    s2 = "00000001000000010000000"
    s3 = "00010000000000000001000"

    # Show the inputs first, one per line.
    for label, segmentation in (("s1:", s1), ("s2:", s2), ("s3:", s3)):
        print(label, segmentation)

    # Each entry: the caption to print, followed by the pair to compare.
    comparisons = (
        ("windowdiff(s1, s1, 3) = ", s1, s1),
        ("windowdiff(s1, s2, 3) = ", s1, s2),
        ("windowdiff(s2, s3, 3) = ", s2, s3),
    )
    for caption, first, second in comparisons:
        print(caption, windowdiff(first, second, 3))