Source code for nltk.corpus.reader.xmldocs

# Natural Language Toolkit: XML Corpus Reader
# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <>
# URL: <>
# For license information, see LICENSE.TXT

"""
Corpus reader for corpora whose documents are xml files.

(note -- not named 'xml' to avoid conflicting w/ standard xml package)
"""
from __future__ import print_function, unicode_literals

import codecs
import re

# Use the c version of ElementTree, which is faster, if possible:
try:
    from xml.etree import cElementTree as ElementTree
except ImportError:
    from xml.etree import ElementTree

from nltk import compat
from nltk.data import PathPointer, SeekableUnicodeStreamReader
from nltk.tokenize import WordPunctTokenizer
from nltk.internals import ElementWrapper

from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import *

class XMLCorpusReader(CorpusReader):
    """
    Corpus reader for corpora whose documents are xml files.

    Note that the ``XMLCorpusReader`` constructor does not take an
    ``encoding`` argument, because the unicode encoding is specified by
    the XML files themselves.  See the XML specs for more info.
    """

    def __init__(self, root, fileids, wrap_etree=False):
        """
        :param root: Passed unchanged to ``CorpusReader.__init__``.
        :param fileids: Passed unchanged to ``CorpusReader.__init__``.
        :param wrap_etree: If true, ``xml()`` wraps the root element it
            returns in an ``ElementWrapper``.
        """
        self._wrap_etree = wrap_etree
        CorpusReader.__init__(self, root, fileids)

    def xml(self, fileid=None):
        """
        Parse a single XML file and return its root element.

        :param fileid: The identifier of the file to parse.  May be
            omitted only when the corpus consists of exactly one file.
        :return: The root element of the parsed document, wrapped in an
            ``ElementWrapper`` if the reader was built with
            ``wrap_etree=True``.
        :raise TypeError: If ``fileid`` is not a single string (which
            includes omitting it for a multi-file corpus).
        """
        # Make sure we have exactly one file -- no concatenating XML.
        if fileid is None and len(self._fileids) == 1:
            fileid = self._fileids[0]
        if not isinstance(fileid, compat.string_types):
            raise TypeError('Expected a single file identifier string')

        # Read the XML in using ElementTree, and make sure the
        # underlying stream is closed even if parsing fails.
        stream = self.abspath(fileid).open()
        try:
            elt = ElementTree.parse(stream).getroot()
        finally:
            stream.close()

        # If requested, wrap it.
        if self._wrap_etree:
            elt = ElementWrapper(elt)

        # Return the ElementTree element.
        return elt

    def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified
        file that were in text nodes -- ie, tags are ignored.  Like the
        xml() method, fileid can only specify one file.

        :return: the given file's text nodes as a list of words and
            punctuation symbols
        :rtype: list(str)
        """
        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        word_tokenizer = WordPunctTokenizer()

        out = []
        # ``Element.getiterator()`` was removed in Python 3.9; ``iter()``
        # is the equivalent document-order traversal.
        for node in elt.iter():
            text = node.text
            if text is None:
                continue
            if isinstance(text, bytes):
                text = text.decode(encoding)
            out.extend(word_tokenizer.tokenize(text))
        return out

    def raw(self, fileids=None):
        """
        Return the unparsed contents of the given file(s), concatenated.

        :param fileids: A single file identifier, a list of file
            identifiers, or None to select every file in the corpus.
        :return: The result of ``concat()`` applied to the contents of
            the selected files, in order.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, compat.string_types):
            fileids = [fileids]

        contents = []
        for f in fileids:
            stream = self.open(f)
            try:
                contents.append(stream.read())
            finally:
                # Do not leave one open handle behind per file.
                stream.close()
        return concat(contents)
class XMLCorpusView(StreamBackedCorpusView):
    """
    A corpus view that selects out specified elements from an XML
    file, and provides a flat list-like interface for accessing them.
    (Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself,
    but may be used by subclasses of ``XMLCorpusReader``.)

    Every XML corpus view has a "tag specification", indicating what
    XML elements should be included in the view; and each (non-nested)
    element that matches this specification corresponds to one item in
    the view.  Tag specifications are regular expressions over tag
    paths, where a tag path is a list of element tag names, separated
    by '/', indicating the ancestry of the element.  Some examples:

      - ``'foo'``: A top-level element whose tag is ``foo``.
      - ``'foo/bar'``: An element whose tag is ``bar`` and whose parent
        is a top-level element whose tag is ``foo``.
      - ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere
        in the xml tree.
      - ``'.*/(foo|bar)'``: An element whose tag is ``foo`` or ``bar``,
        appearing anywhere in the xml tree.

    The view items are generated from the selected XML elements via
    the method ``handle_elt()``.  By default, this method returns the
    element as-is (i.e., as an ElementTree object); but it can be
    overridden, either via subclassing or via the ``elt_handler``
    constructor parameter.
    """

    #: If true, then display debugging output to stdout when reading
    #: blocks.
    _DEBUG = False

    #: The number of characters read at a time by this corpus reader.
    _BLOCK_SIZE = 1024

    #: Byte-order marks and the encoding each one implies.  The UTF-32
    #: marks must be tested before the UTF-16 ones, because the
    #: UTF-32-LE mark begins with the same two bytes as the UTF-16-LE
    #: mark.
    _BOM_ENCODINGS = (
        (codecs.BOM_UTF32_BE, 'utf-32-be'),
        (codecs.BOM_UTF32_LE, 'utf-32-le'),
        (codecs.BOM_UTF16_BE, 'utf-16-be'),
        (codecs.BOM_UTF16_LE, 'utf-16-le'),
        (codecs.BOM_UTF8, 'utf-8'),
    )

    def __init__(self, fileid, tagspec, elt_handler=None):
        """
        Create a new corpus view based on a specified XML file.

        Note that the ``XMLCorpusView`` constructor does not take an
        ``encoding`` argument, because the unicode encoding is
        specified by the XML files themselves.

        :type tagspec: str
        :param tagspec: A tag specification, indicating what XML
            elements should be included in the view.  Each non-nested
            element that matches this specification corresponds to one
            item in the view.

        :param elt_handler: A function used to transform each element
            to a value for the view.  If no handler is specified, then
            ``self.handle_elt()`` is called, which returns the element
            as an ElementTree object.  The signature of elt_handler is::

                elt_handler(elt, tagspec) -> value
        """
        if elt_handler:
            self.handle_elt = elt_handler

        # The tag specification for this corpus view, anchored so that
        # it must match a complete tag path.
        self._tagspec = re.compile(tagspec + r'\Z')

        # A dictionary mapping from file positions (as returned by
        # ``stream.tell()``) to XML contexts.  An XML context is a
        # tuple of XML tag names, indicating which tags have not yet
        # been closed.
        self._tag_context = {0: ()}

        encoding = self._detect_encoding(fileid)
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

    def _detect_encoding(self, fileid):
        """
        Guess the encoding of an XML file from its first line.

        A byte-order mark takes precedence; otherwise the ``encoding``
        pseudo-attribute of the XML declaration is used; otherwise
        ``'utf-8'`` is returned.

        :param fileid: A ``PathPointer`` or a filesystem path.
        :return: The name of the detected encoding.
        :rtype: str
        """
        if isinstance(fileid, PathPointer):
            stream = fileid.open()
            try:
                s = stream.readline()
            finally:
                stream.close()
        else:
            with open(fileid, 'rb') as infile:
                s = infile.readline()

        for bom, encoding in self._BOM_ENCODINGS:
            if s.startswith(bom):
                return encoding

        m = re.match(br'\s*<\?xml\b.*\bencoding="([^"]+)"', s)
        if m:
            return m.group(1).decode()
        m = re.match(br"\s*<\?xml\b.*\bencoding='([^']+)'", s)
        if m:
            return m.group(1).decode()

        # No encoding found -- what should the default be?
        return 'utf-8'

    def handle_elt(self, elt, context):
        """
        Convert an element into an appropriate value for inclusion in
        the view.  Unless overridden by a subclass or by the
        ``elt_handler`` constructor argument, this method simply
        returns ``elt``.

        :return: The view value corresponding to ``elt``.

        :type elt: ElementTree
        :param elt: The element that should be converted.

        :type context: str
        :param context: A string composed of element tags separated by
            forward slashes, indicating the XML context of the given
            element.  For example, the string ``'foo/bar/baz'``
            indicates that the element is a ``baz`` element whose
            parent is a ``bar`` element and whose grandparent is a
            top-level ``foo`` element.
        """
        return elt

    #: A regular expression that matches XML fragments that do not
    #: contain any un-closed tags.  The square brackets of the CDATA
    #: delimiters are escaped; left bare they would be parsed as a
    #: character class and CDATA sections would never match.
    _VALID_XML_RE = re.compile(r"""
        [^<]*
        (
          ((<!--.*?-->)                         |  # comment
           (<!\[CDATA\[.*?\]\]>)                |  # raw character data
           (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) |  # doctype decl
           (<[^!>][^>]*>))                         # tag or PI
          [^<]*)*
        \Z""", re.DOTALL | re.VERBOSE)

    #: A regular expression used to extract the tag name from a start
    #: tag, end tag, or empty-elt tag string.  The name stops at '/' so
    #: that ``<foo/>`` yields ``foo`` rather than ``foo/``.
    _XML_TAG_NAME = re.compile(r'<\s*/?\s*([^\s/>]+)')

    #: A regular expression used to find all start-tags, end-tags, and
    #: empty-elt tags in an XML file.  This regexp is more lenient than
    #: the XML spec -- e.g., it allows spaces in some places where the
    #: spec does not.
    _XML_PIECE = re.compile(r"""
        # Include these so we can skip them:
        (?P<COMMENT>        <!--.*?-->                          )|
        (?P<CDATA>          <!\[CDATA\[.*?\]\]>                 )|
        (?P<PI>             <\?.*?\?>                           )|
        (?P<DOCTYPE>        <!DOCTYPE\s+[^\[^>]*(\[[^\]]*])?\s*>)|
        # These are the ones we actually care about:
        (?P<EMPTY_ELT_TAG>  <\s*[^>/\?!\s][^>]*/\s*>            )|
        (?P<START_TAG>      <\s*[^>/\?!\s][^>]*>                )|
        (?P<END_TAG>        <\s*/[^>/\?!\s][^>]*>               )""",
                            re.DOTALL | re.VERBOSE)

    def _read_xml_fragment(self, stream):
        """
        Read a string from the given stream that does not contain any
        un-closed tags.  In particular, this function first reads a
        block from the stream of size ``self._BLOCK_SIZE``.  It then
        checks if that block contains an un-closed tag.  If it does,
        then this function either backtracks to the last '<', or reads
        another block.

        :return: The fragment read, or the empty string at end of file.
        :raise ValueError: If a '>' appears before any '<' in a
            fragment that is not well-formed, or if the file ends
            inside a tag.
        """
        fragment = ''
        is_unicode_reader = isinstance(stream, SeekableUnicodeStreamReader)

        if is_unicode_reader:
            startpos = stream.tell()
        while True:
            # Read a block and add it to the fragment.
            xml_block = stream.read(self._BLOCK_SIZE)
            fragment += xml_block

            # Do we have a well-formed xml fragment?
            if self._VALID_XML_RE.match(fragment):
                return fragment

            # Do we have a fragment that will never be well-formed?
            # (A fragment that failed the check above always contains
            # a '<', so this search cannot come back empty.)
            first_bracket = re.search('[<>]', fragment)
            if first_bracket.group(0) == '>':
                pos = stream.tell() - (len(fragment) - first_bracket.end())
                raise ValueError('Unexpected ">" near char %s' % pos)

            # End of file?
            if not xml_block:
                raise ValueError('Unexpected end of file: tag not closed')

            # If not, then we must be in the middle of a <..tag..>.
            # If appropriate, backtrack to the most recent '<'
            # character.
            last_open_bracket = fragment.rfind('<')
            if last_open_bracket > 0:
                if self._VALID_XML_RE.match(fragment[:last_open_bracket]):
                    if is_unicode_reader:
                        # Positions in this reader are not character
                        # offsets, so rewind and step forward by chars.
                        stream.seek(startpos)
                        stream.char_seek_forward(last_open_bracket)
                    else:
                        stream.seek(-(len(fragment) - last_open_bracket), 1)
                    return fragment[:last_open_bracket]

            # Otherwise, read another block. (i.e., return to the
            # top of the loop.)

    def read_block(self, stream, tagspec=None, elt_handler=None):
        """
        Read from ``stream`` until we find at least one element that
        matches ``tagspec``, and return the result of applying
        ``elt_handler`` to each element found.

        :param stream: The stream to read from; its current position
            must be one for which an XML context has been recorded.
        :param tagspec: The tag specification to match; defaults to
            the one given to the constructor.
        :param elt_handler: The element handler to apply; defaults to
            ``self.handle_elt``.
        :return: A list of handler results, one per element found.
        :raise ValueError: On mismatched tags, on end of file inside a
            selected element, or if no XML context is recorded for the
            stream's current position.
        """
        if tagspec is None:
            tagspec = self._tagspec
        if elt_handler is None:
            elt_handler = self.handle_elt

        # Use a stack of strings to keep track of our context:
        saved_context = self._tag_context.get(stream.tell())
        if saved_context is None:
            raise ValueError('No XML context recorded for stream '
                             'position %s' % stream.tell())
        context = list(saved_context)

        elts = []

        elt_start = None  # where does the elt start
        elt_depth = None  # what context depth
        elt_text = ''

        is_unicode_reader = isinstance(stream, SeekableUnicodeStreamReader)

        while not elts or elt_start is not None:
            if is_unicode_reader:
                startpos = stream.tell()
            xml_fragment = self._read_xml_fragment(stream)

            # End of file.
            if not xml_fragment:
                if elt_start is None:
                    break
                raise ValueError('Unexpected end of file')

            # Process each <tag> in the xml fragment.
            for piece in self._XML_PIECE.finditer(xml_fragment):
                if self._DEBUG:
                    print('%25s %s' % ('/'.join(context)[-20:],
                                       piece.group()))

                if piece.group('START_TAG'):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    # Keep context up-to-date.
                    context.append(name)
                    # Is this one of the elts we're looking for?
                    if elt_start is None:
                        if re.match(tagspec, '/'.join(context)):
                            elt_start = piece.start()
                            elt_depth = len(context)

                elif piece.group('END_TAG'):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    # sanity checks:
                    if not context:
                        raise ValueError('Unmatched tag </%s>' % name)
                    if name != context[-1]:
                        raise ValueError('Unmatched tag <%s>...</%s>' %
                                         (context[-1], name))
                    # Is this the end of an element?
                    if elt_start is not None and elt_depth == len(context):
                        elt_text += xml_fragment[elt_start:piece.end()]
                        elts.append((elt_text, '/'.join(context)))
                        elt_start = elt_depth = None
                        elt_text = ''
                    # Keep context up-to-date
                    context.pop()

                elif piece.group('EMPTY_ELT_TAG'):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    if elt_start is None:
                        path = '/'.join(context) + '/' + name
                        if re.match(tagspec, path):
                            elts.append((piece.group(), path))

            if elt_start is not None:
                # If we haven't found any elements yet, then keep
                # looping until we do.
                if not elts:
                    elt_text += xml_fragment[elt_start:]
                    elt_start = 0

                # If we've found at least one element, then try
                # backtracking to the start of the element that we're
                # inside of.
                else:
                    # take back the last start-tag, and return what
                    # we've gotten so far (elts is non-empty).
                    if self._DEBUG:
                        print(' ' * 36 + '(backtrack)')
                    if is_unicode_reader:
                        stream.seek(startpos)
                        stream.char_seek_forward(elt_start)
                    else:
                        stream.seek(-(len(xml_fragment) - elt_start), 1)
                    context = context[:elt_depth - 1]
                    elt_start = elt_depth = None
                    elt_text = ''

            # Update the _tag_context dict.
            pos = stream.tell()
            if pos in self._tag_context:
                assert tuple(context) == self._tag_context[pos]
            else:
                self._tag_context[pos] = tuple(context)

        # Non-ASCII characters are sent to the parser as numeric
        # character references, so the bytes handed over are pure ASCII.
        return [elt_handler(ElementTree.fromstring(
                                elt.encode('ascii', 'xmlcharrefreplace')),
                            context)
                for (elt, context) in elts]