summary | refs | log | tree | commit | diff
path: root/bs4/builder/_lxml.py
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2021-11-29 22:13:33 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2021-11-29 22:13:33 -0500
commit: ad52722cc6b55ce414d395e9a0860cee57c0ab2d (patch)
tree: 8ff820b41d9ee5fb1f896629782270349cd8311b /bs4/builder/_lxml.py
parent: c005e9ba28b4eec3a5fab173b928609bc692dd51 (diff)
Do a better job of keeping track of namespaces as an XML document is
parsed, so that CSS selectors that use namespaces will do the right thing more often. [bug=1946243]
Diffstat (limited to 'bs4/builder/_lxml.py')
-rw-r--r-- bs4/builder/_lxml.py 43
1 files changed, 38 insertions, 5 deletions
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index d8251b2..971c81e 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -80,15 +80,24 @@ class LXMLTreeBuilderForXML(TreeBuilder):
This might be useful later on when creating CSS selectors.
+ This will track (almost) all namespaces, even ones that were
+ only in scope for part of the document. If two namespaces have
+ the same prefix, only the first one encountered will be
+ tracked. Un-prefixed namespaces are not tracked.
+
:param mapping: A dictionary mapping namespace prefixes to URIs.
"""
for key, value in list(mapping.items()):
+ # This is 'if key' and not 'if key is not None' because we
+ # don't track un-prefixed namespaces. Soupselect will
+ # treat an un-prefixed namespace as the default, which
+ # causes confusion in some cases.
if key and key not in self.soup._namespaces:
# Let the BeautifulSoup object know about a new namespace.
# If there are multiple namespaces defined with the same
# prefix, the first one in the document takes precedence.
self.soup._namespaces[key] = value
-
+
def default_parser(self, encoding):
"""Find the default parser for the given encoding.
@@ -126,6 +135,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.empty_element_tags = set(empty_element_tags)
self.soup = None
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+ self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
def _getNsTag(self, tag):
@@ -250,6 +260,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# mappings.
self.nsmaps.append(_invert(nsmap))
+ # The currently active namespace prefixes have
+ # changed. Calculate the new mapping so it can be stored
+ # with all Tag objects created while these prefixes are in
+ # scope.
+ current_mapping = dict(self.active_namespace_prefixes[-1])
+ current_mapping.update(nsmap)
+
+ # We should not track un-prefixed namespaces as we can only hold one
+ # and it will be recognized as the default namespace by soupsieve,
+ # which may be confusing in some situations.
+ if '' in current_mapping:
+ del current_mapping['']
+ self.active_namespace_prefixes.append(current_mapping)
+
# Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later.
attrs = attrs.copy()
@@ -274,8 +298,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
namespace, name = self._getNsTag(name)
nsprefix = self._prefix_for_namespace(namespace)
- self.soup.handle_starttag(name, namespace, nsprefix, attrs)
-
+ self.soup.handle_starttag(
+ name, namespace, nsprefix, attrs,
+ namespaces=self.active_namespace_prefixes[-1]
+ )
+
def _prefix_for_namespace(self, namespace):
"""Find the currently active prefix for the given namespace."""
if namespace is None:
@@ -299,8 +326,14 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if len(self.nsmaps) > 1:
# This tag, or one of its parents, introduced a namespace
# mapping, so pop it off the stack.
- self.nsmaps.pop()
-
+ out_of_scope_nsmap = self.nsmaps.pop()
+
+ if out_of_scope_nsmap is not None:
+ # This tag introduced a namespace mapping which is no
+ # longer in scope. Recalculate the currently active
+ # namespace prefixes.
+ self.active_namespace_prefixes.pop()
+
def pi(self, target, data):
self.soup.endData()
data = target + ' ' + data