week06
This commit is contained in:
438
env/lib/python3.12/site-packages/lxml/iterparse.pxi
vendored
Normal file
438
env/lib/python3.12/site-packages/lxml/iterparse.pxi
vendored
Normal file
@ -0,0 +1,438 @@
|
||||
# iterparse -- event-driven parsing
|
||||
|
||||
# Maximum number of bytes requested from the source per read() call in
# iterparse._read_more_events(); each chunk read is fed to the parser.
DEF __ITERPARSE_CHUNK_SIZE = 32768
|
||||
|
||||
cdef class iterparse:
    """iterparse(self, source, events=("end",), *, tag=None, \
                  attribute_defaults=False, dtd_validation=False, \
                  load_dtd=False, no_network=True, remove_blank_text=False, \
                  compact=True, resolve_entities=True, remove_comments=False, \
                  remove_pis=False, strip_cdata=True, encoding=None, \
                  html=False, recover=None, huge_tree=False, collect_ids=True, \
                  schema=None)

    Incremental parser.

    Parses XML into a tree and generates tuples (event, element) in a
    SAX-like fashion. ``event`` is any of 'start', 'end', 'start-ns',
    'end-ns'.

    For 'start' and 'end', ``element`` is the Element that the parser just
    found opening or closing. For 'start-ns', it is a tuple (prefix, URI) of
    a new namespace declaration. For 'end-ns', it is simply None. Note that
    all start and end events are guaranteed to be properly nested.

    The keyword argument ``events`` specifies a sequence of event type names
    that should be generated. By default, only 'end' events will be
    generated.

    The additional ``tag`` argument restricts the 'start' and 'end' events to
    those elements that match the given tag. The ``tag`` argument can also be
    a sequence of tags to allow matching more than one tag. By default,
    events are generated for all elements. Note that the 'start-ns' and
    'end-ns' events are not impacted by this restriction.

    The other keyword arguments in the constructor are mainly based on the
    libxml2 parser configuration. A DTD will also be loaded if validation or
    attribute default values are requested.

    Available boolean keyword arguments:
    - attribute_defaults: read default attributes from DTD
    - dtd_validation: validate (if DTD is available)
    - load_dtd: use DTD for parsing
    - no_network: prevent network access for related files
    - remove_blank_text: discard blank text nodes
    - remove_comments: discard comments
    - remove_pis: discard processing instructions
    - strip_cdata: replace CDATA sections by normal text content (default: True)
    - compact: save memory for short text content (default: True)
    - resolve_entities: replace entities by their text value (default: True)
    - huge_tree: disable security restrictions and support very deep trees
                 and very long text content (only affects libxml2 2.7+)
    - html: parse input as HTML (default: XML)
    - recover: try hard to parse through broken input (default: True for HTML,
               False otherwise)
    - collect_ids: forwarded to the XML parser (default: True); not used
                   when ``html`` is true

    Other keyword arguments:
    - encoding: override the document encoding
    - schema: an XMLSchema to validate against
    """
    cdef _FeedParser _parser
    cdef object _tag
    cdef object _events          # iterator returned by parser.read_events()
    cdef readonly object root    # set to the result of parser.close() at EOF
    cdef object _source          # file-like object being read, None once released
    cdef object _filename
    cdef object _error           # exception deferred until queued events are drained
    cdef bint _close_source_after_read  # true only if we opened the file ourselves

    def __init__(self, source, events=("end",), *, tag=None,
                 attribute_defaults=False, dtd_validation=False,
                 load_dtd=False, no_network=True, remove_blank_text=False,
                 compact=True, resolve_entities=True, remove_comments=False,
                 remove_pis=False, strip_cdata=True, encoding=None,
                 html=False, recover=None, huge_tree=False, collect_ids=True,
                 XMLSchema schema=None):
        if not hasattr(source, 'read'):
            # Not a file-like object: treat it as a path and open it ourselves.
            source = _getFSPathOrObject(source)
            self._filename = source
            self._source = open(source, 'rb')
            self._close_source_after_read = True
        else:
            # Caller-owned file object: read from it but never close it.
            self._filename = _getFilenameForFile(source)
            self._source = source
            self._close_source_after_read = False

        if recover is None:
            recover = html

        try:
            if html:
                # make sure we're not looking for namespaces
                events = [event for event in events
                          if event not in ('start-ns', 'end-ns')]
                parser = HTMLPullParser(
                    events,
                    tag=tag,
                    recover=recover,
                    base_url=self._filename,
                    encoding=encoding,
                    remove_blank_text=remove_blank_text,
                    remove_comments=remove_comments,
                    remove_pis=remove_pis,
                    strip_cdata=strip_cdata,
                    no_network=no_network,
                    target=None,  # TODO
                    schema=schema,
                    compact=compact)
            else:
                parser = XMLPullParser(
                    events,
                    tag=tag,
                    recover=recover,
                    base_url=self._filename,
                    encoding=encoding,
                    attribute_defaults=attribute_defaults,
                    dtd_validation=dtd_validation,
                    load_dtd=load_dtd,
                    no_network=no_network,
                    schema=schema,
                    huge_tree=huge_tree,
                    remove_blank_text=remove_blank_text,
                    resolve_entities=resolve_entities,
                    remove_comments=remove_comments,
                    remove_pis=remove_pis,
                    strip_cdata=strip_cdata,
                    # honour the caller's choice instead of a hard-coded True
                    collect_ids=collect_ids,
                    target=None,  # TODO
                    compact=compact)
        except BaseException:
            # Do not leave a file that we opened dangling if the parser
            # could not be set up.
            self._close_source()
            raise

        self._events = parser.read_events()
        self._parser = parser

    @property
    def error_log(self):
        """The error log of the last (or current) parser run.
        """
        return self._parser.feed_error_log

    @property
    def resolvers(self):
        """The custom resolver registry of the last (or current) parser run.
        """
        return self._parser.resolvers

    @property
    def version(self):
        """The version of the underlying XML parser."""
        return self._parser.version

    def set_element_class_lookup(self, ElementClassLookup lookup = None):
        """set_element_class_lookup(self, lookup = None)

        Set a lookup scheme for element classes generated from this parser.

        Reset it by passing None or nothing.
        """
        self._parser.set_element_class_lookup(lookup)

    def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
        """makeelement(self, _tag, attrib=None, nsmap=None, **_extra)

        Creates a new element associated with this parser.

        All arguments are forwarded to the underlying parser's
        ``makeelement()`` and its result is returned.
        """
        return self._parser.makeelement(
            _tag, attrib=attrib, nsmap=nsmap, **_extra)

    @cython.final
    cdef _close_source(self):
        """Release the source; close it only if it was opened by us."""
        if self._source is None:
            return
        if not self._close_source_after_read:
            self._source = None
            return
        try:
            close = self._source.close
        except AttributeError:
            close = None
        finally:
            # Drop the reference first so that a failing close() cannot
            # leave a half-closed source attached to this object.
            self._source = None
        if close is not None:
            close()

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next (event, element) tuple, reading more input if needed.

        An exception raised while reading or parsing is held back until all
        events that are already queued have been returned, then raised.
        Raises StopIteration when the input is exhausted without errors.
        """
        try:
            return next(self._events)
        except StopIteration:
            pass
        context = <_SaxParserContext>self._parser._getPushParserContext()
        if self._source is not None:
            done = False
            while not done:
                try:
                    done = self._read_more_events(context)
                    return next(self._events)
                except StopIteration:
                    pass  # no events yet
                except Exception as e:
                    # Remember the error, but first hand out whatever events
                    # the parser produced before it failed.
                    self._error = e
                    self._close_source()
                    try:
                        return next(self._events)
                    except StopIteration:
                        break
        # nothing left to read or return
        if self._error is not None:
            error = self._error
            self._error = None
            raise error
        if (context._validator is not None
                and not context._validator.isvalid()):
            _raiseParseError(context._c_ctxt, self._filename,
                             context._error_log)
        # no errors => all done
        raise StopIteration

    @cython.final
    cdef bint _read_more_events(self, _SaxParserContext context) except -123:
        """Read one chunk from the source and feed it to the parser.

        Returns True once the source is exhausted (the parser is closed, its
        result stored in ``self.root`` and the source released), False if
        data was fed.  Raises TypeError if the source does not return bytes.
        """
        data = self._source.read(__ITERPARSE_CHUNK_SIZE)
        if not isinstance(data, bytes):
            self._close_source()
            raise TypeError("reading file objects must return bytes objects")
        if not data:
            try:
                self.root = self._parser.close()
            finally:
                self._close_source()
            return True
        self._parser.feed(data)
        return False
|
||||
|
||||
|
||||
cdef enum _IterwalkSkipStates:
    # Start events have been queued; once a 'start'/'start-ns' event is at
    # the head of the queue when it is returned, skipping becomes possible.
    IWSKIP_NEXT_IS_START
    # skip_subtree() was requested; the next step does not descend into
    # the children of the current node.
    IWSKIP_SKIP_NEXT
    # A start event was just returned; skip_subtree() will be honoured.
    IWSKIP_CAN_SKIP
    # Default state; calls to skip_subtree() are ignored.
    IWSKIP_CANNOT_SKIP
|
||||
|
||||
|
||||
cdef class iterwalk:
    """iterwalk(self, element_or_tree, events=("end",), tag=None)

    A tree walker that generates events from an existing tree as if it
    was parsing XML data with ``iterparse()``.

    Just as for ``iterparse()``, the ``tag`` argument can be a single tag or a
    sequence of tags.

    After receiving a 'start' or 'start-ns' event, the children and
    descendants of the current element can be excluded from iteration
    by calling the ``skip_subtree()`` method.
    """
    cdef _MultiTagMatcher _matcher
    cdef list _node_stack        # (element, ns_count) pairs from root to current node
    cdef list _events            # queued events not yet returned
    cdef object _pop_event       # bound ``self._events.pop``
    cdef object _include_siblings  # tree root whose trailing comments/PIs are pending
    cdef int _index              # index of the current node in _node_stack, -1 when done
    cdef int _event_filter
    cdef _IterwalkSkipStates _skip_state

    def __init__(self, element_or_tree, events=("end",), tag=None):
        cdef _Element root
        cdef int ns_count
        root = _rootNodeOrRaise(element_or_tree)
        self._event_filter = _buildParseEventFilter(events)
        if tag is None or tag == '*':
            self._matcher = None
        else:
            self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag)
        self._node_stack = []
        self._events = []
        self._pop_event = self._events.pop
        # ignore all skip requests by default
        self._skip_state = IWSKIP_CANNOT_SKIP

        if not self._event_filter:
            # No events requested: mark the walk as already finished.
            self._index = -1
            return

        self._index = 0
        if self._matcher is not None and self._event_filter & PARSE_EVENT_FILTER_START:
            self._matcher.cacheTags(root._doc)

        # When processing an ElementTree, add events for the preceding comments/PIs.
        if (self._event_filter & (PARSE_EVENT_FILTER_COMMENT | PARSE_EVENT_FILTER_PI)
                and isinstance(element_or_tree, _ElementTree)):
            self._include_siblings = root
            # itersiblings(preceding=True) is reversed here to queue the
            # siblings in the opposite order.
            for sibling in reversed(list(root.itersiblings(preceding=True))):
                if self._event_filter & PARSE_EVENT_FILTER_COMMENT and sibling.tag is Comment:
                    self._events.append(('comment', sibling))
                elif self._event_filter & PARSE_EVENT_FILTER_PI and sibling.tag is PI:
                    self._events.append(('pi', sibling))

        ns_count = self._start_node(root)
        self._node_stack.append((root, ns_count))

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next queued event, advancing the tree walk as needed.

        Raises StopIteration once the walk is complete and no events remain.
        """
        cdef xmlNode* c_next
        cdef _Element current
        cdef _Element child
        cdef int ns_count = 0
        if self._events:
            return self._next_event()
        if self._matcher is not None and self._index >= 0:
            current = self._node_stack[self._index][0]
            self._matcher.cacheTags(current._doc)

        # find next node
        while self._index >= 0:
            current = self._node_stack[self._index][0]

            if self._skip_state == IWSKIP_SKIP_NEXT:
                # skip_subtree() was called: pretend there are no children
                c_next = NULL
            else:
                c_next = self._process_non_elements(
                    current._doc, _findChildForwards(current._c_node, 0))
            self._skip_state = IWSKIP_CANNOT_SKIP

            while c_next is NULL:
                # back off through parents
                self._index -= 1
                current = self._end_node()
                if self._index < 0:
                    break
                c_next = self._process_non_elements(
                    current._doc, _nextElement(current._c_node))

            if c_next is not NULL:
                child = _elementFactory(current._doc, c_next)
                if self._event_filter & (PARSE_EVENT_FILTER_START |
                                         PARSE_EVENT_FILTER_START_NS):
                    ns_count = self._start_node(child)
                elif self._event_filter & PARSE_EVENT_FILTER_END_NS:
                    ns_count = _countNsDefs(child._c_node)
                self._node_stack.append((child, ns_count))
                self._index += 1
            if self._events:
                return self._next_event()

        if self._include_siblings is not None:
            # Walk finished: queue the comments/PIs that follow the root,
            # and make sure this happens only once.
            current = self._include_siblings
            self._include_siblings = None
            self._process_non_elements(current._doc, _nextElement(current._c_node))
            if self._events:
                return self._next_event()

        raise StopIteration

    @cython.final
    cdef xmlNode* _process_non_elements(self, _Document doc, xmlNode* c_node):
        """Queue events for a run of comment/PI nodes starting at ``c_node``.

        Returns the node at which the run stopped: an element node, any
        other node type, or NULL.
        """
        cdef int wanted
        while c_node is not NULL and c_node.type != tree.XML_ELEMENT_NODE:
            if c_node.type == tree.XML_COMMENT_NODE:
                event_name = "comment"
                wanted = self._event_filter & PARSE_EVENT_FILTER_COMMENT
            elif c_node.type == tree.XML_PI_NODE:
                event_name = "pi"
                wanted = self._event_filter & PARSE_EVENT_FILTER_PI
            else:
                break
            if wanted:
                self._events.append((event_name, _elementFactory(doc, c_node)))
            c_node = _nextElement(c_node)
        return c_node

    @cython.final
    cdef _next_event(self):
        """Remove and return the oldest queued event.

        Enables skipping if that event is a pending 'start'/'start-ns'.
        """
        if (self._skip_state == IWSKIP_NEXT_IS_START
                and self._events[0][0] in ('start', 'start-ns')):
            self._skip_state = IWSKIP_CAN_SKIP
        return self._pop_event(0)

    def skip_subtree(self):
        """Prevent descending into the current subtree.
        Instead, the next returned event will be the 'end' event of the current element
        (if included), ignoring any children or descendants.

        This has no effect right after an 'end' or 'end-ns' event.
        """
        if self._skip_state == IWSKIP_CAN_SKIP:
            self._skip_state = IWSKIP_SKIP_NEXT

    @cython.final
    cdef int _start_node(self, _Element node) except -1:
        """Queue the start events for ``node`` as selected by the filter.

        Returns the namespace declaration count for ``node``, or 0 when no
        namespace events were requested.
        """
        cdef int ns_count = 0
        if self._event_filter & PARSE_EVENT_FILTER_START_NS:
            ns_count = _appendStartNsEvents(node._c_node, self._events)
            if self._events:
                self._skip_state = IWSKIP_NEXT_IS_START
        elif self._event_filter & PARSE_EVENT_FILTER_END_NS:
            ns_count = _countNsDefs(node._c_node)
        if (self._event_filter & PARSE_EVENT_FILTER_START
                and (self._matcher is None or self._matcher.matches(node._c_node))):
            self._events.append(("start", node))
            self._skip_state = IWSKIP_NEXT_IS_START
        return ns_count

    @cython.final
    cdef _Element _end_node(self):
        """Pop the innermost node off the stack and queue its end events.

        Returns the popped element.
        """
        cdef _Element node
        cdef int ns_count
        node, ns_count = self._node_stack.pop()
        if (self._event_filter & PARSE_EVENT_FILTER_END
                and (self._matcher is None or self._matcher.matches(node._c_node))):
            self._events.append(("end", node))
        if self._event_filter & PARSE_EVENT_FILTER_END_NS and ns_count:
            # one 'end-ns' event per namespace counted for this node
            self._events.extend([("end-ns", None)] * ns_count)
        return node
|
||||
|
||||
|
||||
cdef int _countNsDefs(xmlNode* c_node) noexcept:
    """Count the namespace definitions on ``c_node`` that have a non-NULL href."""
    cdef int total = 0
    cdef xmlNs* c_ns = c_node.nsDef
    while c_ns is not NULL:
        if c_ns.href is not NULL:
            total += 1
        c_ns = c_ns.next
    return total
|
||||
|
||||
|
||||
cdef int _appendStartNsEvents(xmlNode* c_node, list event_list) except -1:
    """Append a ("start-ns", (prefix, uri)) event for each namespace of ``c_node``.

    Namespace definitions without an href are skipped.  Returns the number
    of events appended to ``event_list``.
    """
    cdef int appended = 0
    cdef xmlNs* c_ns = c_node.nsDef
    while c_ns is not NULL:
        if c_ns.href:
            prefix = funicodeOrEmpty(c_ns.prefix)
            uri = funicode(c_ns.href)
            event_list.append(("start-ns", (prefix, uri)))
            appended += 1
        c_ns = c_ns.next
    return appended
|
Reference in New Issue
Block a user