Do a better job of keeping track of namespaces as an XML document is
parsed, so that CSS selectors that use namespaces will do the right
thing more often. [bug=1946243]
author: Leonard Richardson <leonardr@segfault.org> 2021-11-29 22:13:33 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2021-11-29 22:13:33 -0500
commit: ad52722cc6b55ce414d395e9a0860cee57c0ab2d (patch)
tree: 8ff820b41d9ee5fb1f896629782270349cd8311b
parent: c005e9ba28b4eec3a5fab173b928609bc692dd51 (diff)
6 files changed, 141 insertions, 22 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 4470f64..af99990 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -22,6 +22,10 @@ Python 2 was revision 605.
 * Issue a warning when an HTML parser is used to parse a document that
   looks like XML but not XHTML. [bug=1939121]
 
+* Do a better job of keeping track of namespaces as an XML document is
+  parsed, so that CSS selectors that use namespaces will do the right
+  thing more often. [bug=1946243]
+
 * Some time ago, the misleadingly named "text" argument to find-type
   methods was renamed to the more accurate "string." But this supposed
   "renaming" didn't make it into important places like the method
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 49e05e7..ddf1a86 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -309,8 +309,6 @@ class BeautifulSoup(Tag):
         self._namespaces = dict()
         self.parse_only = parse_only
 
-        self.builder.initialize_soup(self)
-
         if hasattr(markup, 'read'):        # It's a file-type object.
             markup = markup.read()
         elif len(markup) <= 256 and (
@@ -362,6 +360,7 @@ class BeautifulSoup(Tag):
              self.builder.prepare_markup(
                  markup, from_encoding, exclude_encodings=exclude_encodings)):
             self.reset()
+            self.builder.initialize_soup(self)
             try:
                 self._feed()
                 success = True
@@ -400,7 +399,7 @@ class BeautifulSoup(Tag):
         if 'builder' in d and not self.builder.picklable:
             d['builder'] = None
         return d
-
+    
     @classmethod
     def _decode_markup(cls, markup):
         """Ensure `markup` is bytes so it's safe to send into warnings.warn.
@@ -693,7 +692,7 @@ class BeautifulSoup(Tag):
         return most_recently_popped
 
     def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
-                        sourcepos=None):
+                        sourcepos=None, namespaces=None):
         """Called by the tree builder when a new tag is encountered.
 
         :param name: Name of the tag.
@@ -703,6 +702,8 @@ class BeautifulSoup(Tag):
             source document.
         :param sourcepos: The character position within `sourceline` where this
             tag was found.
+        :param namespaces: A dictionary of all namespace prefix mappings 
+            currently in scope in the document.
 
         If this method returns None, the tag was rejected by an active
         SoupStrainer. You should proceed as if the tag had not occurred
@@ -720,7 +721,8 @@ class BeautifulSoup(Tag):
         tag = self.element_classes.get(Tag, Tag)(
             self, self.builder, name, namespace, nsprefix, attrs,
             self.currentTag, self._most_recent_element,
-            sourceline=sourceline, sourcepos=sourcepos
+            sourceline=sourceline, sourcepos=sourcepos,
+            namespaces=namespaces
         )
         if tag is None:
             return tag
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index d8251b2..971c81e 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -80,15 +80,24 @@ class LXMLTreeBuilderForXML(TreeBuilder):
 
         This might be useful later on when creating CSS selectors.
 
+        This will track (almost) all namespaces, even ones that were
+        only in scope for part of the document. If two namespaces have
+        the same prefix, only the first one encountered will be
+        tracked. Un-prefixed namespaces are not tracked.
+
         :param mapping: A dictionary mapping namespace prefixes to URIs.
         """
         for key, value in list(mapping.items()):
+            # This is 'if key' and not 'if key is not None' because we
+            # don't track un-prefixed namespaces. soupsieve would
+            # treat an un-prefixed namespace as the default, which
+            # causes confusion in some cases.
             if key and key not in self.soup._namespaces:
                 # Let the BeautifulSoup object know about a new namespace.
                 # If there are multiple namespaces defined with the same
                 # prefix, the first one in the document takes precedence.
                 self.soup._namespaces[key] = value
-
+                
     def default_parser(self, encoding):
         """Find the default parser for the given encoding.
 
@@ -126,6 +135,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             self.empty_element_tags = set(empty_element_tags)
         self.soup = None
         self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+        self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
         super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
         
     def _getNsTag(self, tag):
@@ -250,6 +260,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             # mappings.
             self.nsmaps.append(_invert(nsmap))
 
+            # The currently active namespace prefixes have
+            # changed. Calculate the new mapping so it can be stored
+            # with all Tag objects created while these prefixes are in
+            # scope.
+            current_mapping = dict(self.active_namespace_prefixes[-1])
+            current_mapping.update(nsmap)
+
+            # We should not track un-prefixed namespaces as we can only hold one
+            # and it will be recognized as the default namespace by soupsieve,
+            # which may be confusing in some situations.
+            if '' in current_mapping:
+                del current_mapping['']
+            self.active_namespace_prefixes.append(current_mapping)
+            
             # Also treat the namespace mapping as a set of attributes on the
             # tag, so we can recreate it later.
             attrs = attrs.copy()
@@ -274,8 +298,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
 
         namespace, name = self._getNsTag(name)
         nsprefix = self._prefix_for_namespace(namespace)
-        self.soup.handle_starttag(name, namespace, nsprefix, attrs)
-      
+        self.soup.handle_starttag(
+            name, namespace, nsprefix, attrs,
+            namespaces=self.active_namespace_prefixes[-1]
+        )
+        
     def _prefix_for_namespace(self, namespace):
         """Find the currently active prefix for the given namespace."""
         if namespace is None:
@@ -299,8 +326,14 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         if len(self.nsmaps) > 1:
             # This tag, or one of its parents, introduced a namespace
             # mapping, so pop it off the stack.
-            self.nsmaps.pop()
-
+            out_of_scope_nsmap = self.nsmaps.pop()
+
+            if out_of_scope_nsmap is not None:
+                # This tag introduced a namespace mapping which is no
+                # longer in scope. Recalculate the currently active
+                # namespace prefixes.
+                self.active_namespace_prefixes.pop()
+            
     def pi(self, target, data):
         self.soup.endData()
         data = target + ' ' + data
diff --git a/bs4/element.py b/bs4/element.py
index 3383621..c6cb2eb 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1183,6 +1183,7 @@ class Tag(PageElement):
                  can_be_empty_element=None, cdata_list_attributes=None,
                  preserve_whitespace_tags=None,
                  interesting_string_types=None,
+                 namespaces=None
     ):
         """Basic constructor.
 
@@ -1215,6 +1216,9 @@ class Tag(PageElement):
             to be considered. The default is to consider
             NavigableString and CData the only interesting string
             subtypes.
+        :param namespaces: A dictionary mapping currently active
+            namespace prefixes to URIs. This can be used later to
+            construct CSS selectors.
         """
         if parser is None:
             self.parser_class = None
@@ -1226,6 +1230,7 @@ class Tag(PageElement):
             raise ValueError("No value provided for new tag's name.")
         self.name = name
         self.namespace = namespace
+        self._namespaces = namespaces or {}
         self.prefix = prefix
         if ((not builder or builder.store_line_numbers)
             and (sourceline is not None or sourcepos is not None)):
@@ -1308,7 +1313,7 @@ class Tag(PageElement):
         for child in self.contents:
             clone.append(child.__copy__())
         return clone
-
+    
     @property
     def is_empty_element(self):
         """Is this tag an empty-element tag? (aka a self-closing tag)
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index be954db..41319d1 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -95,18 +95,93 @@ class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
         return LXMLTreeBuilderForXML
 
     def test_namespace_indexing(self):
-        # We should not track un-prefixed namespaces as we can only hold one
-        # and it will be recognized as the default namespace by soupsieve,
-        # which may be confusing in some situations. When no namespace is provided
-        # for a selector, the default namespace (if defined) is assumed.
-
         soup = self.soup(
             '<?xml version="1.1"?>\n'
             '<root>'
             '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
-            '<prefix:tag xmlns:prefix="http://prefixed-namespace.com">content</tag>'
+            '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</prefix:tag2>'
+            '<prefix2:tag3 xmlns:prefix2="http://another-namespace.com">'
+            '<subtag xmlns="http://another-unprefixed-namespace.com">'
+            '<subsubtag xmlns="http://yet-another-unprefixed-namespace.com">'
+            '</prefix2:tag3>'
             '</root>'
         )
-        assert soup._namespaces == (
-            {'xml': 'http://www.w3.org/XML/1998/namespace', 'prefix': 'http://prefixed-namespace.com'}
+
+        # The BeautifulSoup object includes every namespace prefix
+        # defined in the entire document. This is the default set of
+        # namespaces used by soupsieve.
+        #
+        # Un-prefixed namespaces are not included, and if a given
+        # prefix is defined twice, only the first prefix encountered
+        # in the document shows up here.
+        assert soup._namespaces == {
+            'xml': 'http://www.w3.org/XML/1998/namespace',
+            'prefix': 'http://prefixed-namespace.com',
+            'prefix2': 'http://another-namespace.com'
+        }
+
+        # A Tag object includes only the namespace prefixes
+        # that were in scope when it was parsed.
+
+        # We do not track un-prefixed namespaces as we can only hold
+        # one (the first one), and it will be recognized as the
+        # default namespace by soupsieve, even when operating from a
+        # tag with a different un-prefixed namespace.
+        assert soup.tag._namespaces == {
+            'xml': 'http://www.w3.org/XML/1998/namespace',
+        }
+
+        assert soup.tag2._namespaces == {
+            'prefix': 'http://prefixed-namespace.com',
+            'xml': 'http://www.w3.org/XML/1998/namespace',
+        }
+
+        assert soup.subtag._namespaces == {
+            'prefix2': 'http://another-namespace.com',
+            'xml': 'http://www.w3.org/XML/1998/namespace',
+        }
+
+        assert soup.subsubtag._namespaces == {
+            'prefix2': 'http://another-namespace.com',
+            'xml': 'http://www.w3.org/XML/1998/namespace',
+        }
+
+
+    def test_namespace_interaction_with_select_and_find(self):
+        # Demonstrate how namespaces interact with select* and
+        # find* methods.
+        
+        soup = self.soup(
+            '<?xml version="1.1"?>\n'
+            '<root>'
+            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
+            '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</tag>'
+            '<subtag xmlns:prefix="http://another-namespace-same-prefix.com">'
+             '<prefix:tag3>'
+            '</subtag>'
+            '</root>'
         )
+
+        # soupsieve uses namespace URIs.
+        assert soup.select_one('tag').name == 'tag'
+        assert soup.select_one('prefix|tag2').name == 'tag2'
+
+        # If a prefix is declared more than once, only the first usage
+        # is registered with the BeautifulSoup object.
+        assert soup.select_one('prefix|tag3') is None
+
+        # But you can always explicitly specify a namespace dictionary.
+        assert soup.select_one(
+            'prefix|tag3', namespaces=soup.subtag._namespaces
+        ).name == 'tag3'
+
+        # And a Tag (as opposed to the BeautifulSoup object) will
+        # have a set of default namespaces scoped to that Tag.
+        assert soup.subtag.select_one('prefix|tag3').name=='tag3'
+
+        # The find() methods aren't fully namespace-aware; they just
+        # look at prefixes.
+        assert soup.find('tag').name == 'tag'
+        assert soup.find('prefix:tag2').name == 'tag2'
+        assert soup.find('prefix:tag3').name == 'tag3'
+        assert soup.subtag.find('prefix:tag3').name == 'tag3'
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 67d3648..d81fccd 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -1800,9 +1800,9 @@ selectors.::
  # [<ns1:child>I'm in namespace 1</ns1:child>]
  
 When handling a CSS selector that uses namespaces, Beautiful Soup
-uses the namespace abbreviations it found when parsing the
-document. You can override this by passing in your own dictionary of
-abbreviations::
+tries to use namespace prefixes that make sense based on what it saw
+while parsing the document. You can also provide your own
+dictionary of abbreviations::
 
  namespaces = dict(first="http://namespace1/", second="http://namespace2/")
  soup.select("second|child", namespaces=namespaces)
author	Leonard Richardson <leonardr@segfault.org>	2021-11-29 22:13:33 -0500
committer	Leonard Richardson <leonardr@segfault.org>	2021-11-29 22:13:33 -0500
commit	ad52722cc6b55ce414d395e9a0860cee57c0ab2d (patch)
tree	8ff820b41d9ee5fb1f896629782270349cd8311b
parent	c005e9ba28b4eec3a5fab173b928609bc692dd51 (diff)