week06

2024-12-09 18:22:38 +09:00
parent ab0cbebefc
commit c4c4547706
959 changed files with 174888 additions and 6 deletions
--- a/env/lib/python3.12/site-packages/lxml/cleanup.pxi
+++ b/env/lib/python3.12/site-packages/lxml/cleanup.pxi
@ -0,0 +1,215 @@
+# functions for tree cleanup and removing elements from subtrees
+
+def cleanup_namespaces(tree_or_element, top_nsmap=None, keep_ns_prefixes=None):
+    """cleanup_namespaces(tree_or_element, top_nsmap=None, keep_ns_prefixes=None)
+
+    Remove all namespace declarations from a subtree that are not used
+    by any of the elements or attributes in that tree.
+
+    If a 'top_nsmap' is provided, it must be a mapping from prefixes
+    to namespace URIs.  These namespaces will be declared on the top
+    element of the subtree before running the cleanup, which allows
+    moving namespace declarations to the top of the tree.
+
+    If a 'keep_ns_prefixes' is provided, it must be a list of prefixes.
+    These prefixes will not be removed as part of the cleanup.
+    """
+    element = _rootNodeOrRaise(tree_or_element)
+    c_element = element._c_node
+
+    if top_nsmap:
+        doc = element._doc
+        # declare namespaces from nsmap, then apply them to the subtree
+        _setNodeNamespaces(c_element, doc, None, top_nsmap)
+        moveNodeToDocument(doc, c_element.doc, c_element)
+
+    # Pass the prefixes through _utf8() and collect them in a set; an empty
+    # or missing argument is handed on as None.
+    if keep_ns_prefixes:
+        keep_ns_prefixes = {_utf8(prefix) for prefix in keep_ns_prefixes}
+    else:
+        keep_ns_prefixes = None
+
+    _removeUnusedNamespaceDeclarations(c_element, keep_ns_prefixes)
+
+
+def strip_attributes(tree_or_element, *attribute_names):
+    """strip_attributes(tree_or_element, *attribute_names)
+
+    Remove every attribute whose name matches one of the provided
+    attribute names from an Element (or ElementTree) and from all of
+    its descendants.
+
+    Attribute names can contain wildcards as in `_Element.iter`.
+
+    Example usage::
+
+        strip_attributes(root_element,
+                         'simpleattr',
+                         '{http://some/ns}attrname',
+                         '{http://other/ns}*')
+    """
+    cdef _MultiTagMatcher attr_matcher
+    # resolve the root node first so that invalid input raises even when
+    # no attribute names were given
+    root = _rootNodeOrRaise(tree_or_element)
+    if attribute_names:
+        attr_matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, attribute_names)
+        attr_matcher.cacheTags(root._doc)
+        # only walk the tree when the matcher can accept some attribute
+        if not attr_matcher.rejectsAllAttributes():
+            _strip_attributes(root._c_node, attr_matcher)
+
+
+cdef _strip_attributes(xmlNode* c_node, _MultiTagMatcher matcher):
+    # Remove every attribute accepted by ``matcher`` from the element nodes
+    # visited by the traversal that starts at ``c_node``.
+    cdef xmlAttr* c_attr
+    cdef xmlAttr* c_next_attr
+    # NOTE(review): the BEGIN/END pair looks like a tree-walking loop macro
+    # whose body is the code in between; the trailing 1 presumably makes the
+    # walk include the start node itself - confirm against the macro definition.
+    tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
+    if c_node.type == tree.XML_ELEMENT_NODE:
+        c_attr = c_node.properties
+        while c_attr is not NULL:
+            # read the successor first so the loop can continue after
+            # c_attr has been handed to xmlRemoveProp()
+            c_next_attr = c_attr.next
+            if matcher.matchesAttribute(c_attr):
+                tree.xmlRemoveProp(c_attr)
+            c_attr = c_next_attr
+    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
+
+
+def strip_elements(tree_or_element, *tag_names, bint with_tail=True):
+    """strip_elements(tree_or_element, *tag_names, with_tail=True)
+
+    Delete all elements with the provided tag names from a tree or
+    subtree.  This will remove the elements and their entire subtree,
+    including all their attributes, text content and descendants.  It
+    will also remove the tail text of the element unless you
+    explicitly set the ``with_tail`` keyword argument option to False.
+
+    Tag names can contain wildcards as in `_Element.iter`.
+
+    Note that this will not delete the element (or ElementTree root
+    element) that you passed even if it matches.  It will only treat
+    its descendants.  If you want to include the root element, check
+    its tag name directly before even calling this function.
+
+    Example usage::
+
+        strip_elements(some_element,
+            'simpletagname',             # non-namespaced tag
+            '{http://some/ns}tagname',   # namespaced tag
+            '{http://some/other/ns}*',   # any tag from a namespace
+            lxml.etree.Comment           # comments
+            )
+    """
+    cdef _MultiTagMatcher matcher
+    # validate the input before the early exit below, so that invalid
+    # arguments raise even when no tag names were given
+    doc = _documentOrRaise(tree_or_element)
+    element = _rootNodeOrRaise(tree_or_element)
+    if not tag_names:
+        return
+
+    matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag_names)
+    matcher.cacheTags(doc)
+    if matcher.rejectsAll():
+        # nothing can match, so skip the tree traversal
+        return
+
+    if isinstance(tree_or_element, _ElementTree):
+        # include PIs and comments next to the root node
+        if matcher.matchesType(tree.XML_COMMENT_NODE):
+            _removeSiblings(element._c_node, tree.XML_COMMENT_NODE, with_tail)
+        if matcher.matchesType(tree.XML_PI_NODE):
+            _removeSiblings(element._c_node, tree.XML_PI_NODE, with_tail)
+    _strip_elements(doc, element._c_node, matcher, with_tail)
+
+cdef _strip_elements(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher,
+                     bint with_tail):
+    # Walk the elements reached from ``c_node`` and remove each child node
+    # that ``matcher`` accepts.  ``with_tail`` selects whether the text that
+    # follows a removed node goes away with it.
+    cdef xmlNode* c_child
+    cdef xmlNode* c_next
+
+    tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
+    if c_node.type == tree.XML_ELEMENT_NODE:
+        # we run through the children here to prevent any problems
+        # with the tree iteration which would occur if we unlinked the
+        # c_node itself
+        c_child = _findChildForwards(c_node, 0)
+        while c_child is not NULL:
+            # look up the successor before c_child may get unlinked
+            c_next = _nextElement(c_child)
+            if matcher.matches(c_child):
+                if c_child.type == tree.XML_ELEMENT_NODE:
+                    if not with_tail:
+                        # NOTE(review): unlinking up front presumably keeps
+                        # _removeNode() from taking the tail text along with
+                        # the element - confirm against _removeNode()
+                        tree.xmlUnlinkNode(c_child)
+                    _removeNode(doc, c_child)
+                else:
+                    # matched node that is not an element
+                    if with_tail:
+                        # presumably drops the text directly following the
+                        # node - verify against _removeText()
+                        _removeText(c_child.next)
+                    tree.xmlUnlinkNode(c_child)
+                    attemptDeallocation(c_child)
+            c_child = c_next
+    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
+
+
+def strip_tags(tree_or_element, *tag_names):
+    """strip_tags(tree_or_element, *tag_names)
+
+    Delete all elements with the provided tag names from a tree or
+    subtree.  This will remove the elements and their attributes, but
+    *not* their text/tail content or descendants.  Instead, it will
+    merge the text content and children of the element into its
+    parent.
+
+    Tag names can contain wildcards as in `_Element.iter`.
+
+    Note that this will not delete the element (or ElementTree root
+    element) that you passed even if it matches.  It will only treat
+    its descendants.
+
+    Example usage::
+
+        strip_tags(some_element,
+            'simpletagname',             # non-namespaced tag
+            '{http://some/ns}tagname',   # namespaced tag
+            '{http://some/other/ns}*',   # any tag from a namespace
+            Comment                      # comments (including their text!)
+            )
+    """
+    cdef _MultiTagMatcher matcher
+    # validate the input before the early exit below, so that invalid
+    # arguments raise even when no tag names were given
+    doc = _documentOrRaise(tree_or_element)
+    element = _rootNodeOrRaise(tree_or_element)
+    if not tag_names:
+        return
+
+    matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag_names)
+    matcher.cacheTags(doc)
+    if matcher.rejectsAll():
+        # nothing can match, so skip the tree traversal
+        return
+
+    if isinstance(tree_or_element, _ElementTree):
+        # include PIs and comments next to the root node
+        if matcher.matchesType(tree.XML_COMMENT_NODE):
+            _removeSiblings(element._c_node, tree.XML_COMMENT_NODE, 0)
+        if matcher.matchesType(tree.XML_PI_NODE):
+            _removeSiblings(element._c_node, tree.XML_PI_NODE, 0)
+    _strip_tags(doc, element._c_node, matcher)
+
+cdef _strip_tags(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher):
+    # Walk the elements reached from ``c_node``; each matching child element
+    # is replaced by its own children, any other matching child node is
+    # unlinked from the tree.
+    cdef xmlNode* c_child
+    cdef xmlNode* c_next
+
+    tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
+    if c_node.type == tree.XML_ELEMENT_NODE:
+        # we run through the children here to prevent any problems
+        # with the tree iteration which would occur if we unlinked the
+        # c_node itself
+        c_child = _findChildForwards(c_node, 0)
+        while c_child is not NULL:
+            if not matcher.matches(c_child):
+                c_child = _nextElement(c_child)
+                continue
+            if c_child.type == tree.XML_ELEMENT_NODE:
+                # pick the continuation point before the node is replaced:
+                # its first child if it has one, otherwise its successor
+                c_next = _findChildForwards(c_child, 0) or _nextElement(c_child)
+                _replaceNodeByChildren(doc, c_child)
+                if not attemptDeallocation(c_child):
+                    # NOTE(review): presumably the node could not be freed
+                    # because it is still referenced elsewhere - confirm
+                    # against attemptDeallocation()
+                    if c_child.nsDef is not NULL:
+                        # make namespaces absolute
+                        moveNodeToDocument(doc, doc._c_doc, c_child)
+                c_child = c_next
+            else:
+                # matched node that is not an element: fetch the successor
+                # first, then unlink the node
+                c_next = _nextElement(c_child)
+                tree.xmlUnlinkNode(c_child)
+                attemptDeallocation(c_child)
+                c_child = c_next
+    tree.END_FOR_EACH_ELEMENT_FROM(c_node)