diff options
author | Leonard Richardson <leonardr@segfault.org> | 2021-11-29 22:13:33 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2021-11-29 22:13:33 -0500 |
commit | ad52722cc6b55ce414d395e9a0860cee57c0ab2d (patch) | |
tree | 8ff820b41d9ee5fb1f896629782270349cd8311b | |
parent | c005e9ba28b4eec3a5fab173b928609bc692dd51 (diff) |
Do a better job of keeping track of namespaces as an XML document is
parsed, so that CSS selectors that use namespaces will do the right
thing more often. [bug=1946243]
-rw-r--r-- | CHANGELOG | 4 |
-rw-r--r-- | bs4/__init__.py | 12 |
-rw-r--r-- | bs4/builder/_lxml.py | 43 |
-rw-r--r-- | bs4/element.py | 7 |
-rw-r--r-- | bs4/tests/test_lxml.py | 91 |
-rw-r--r-- | doc/source/index.rst | 6 |
6 files changed, 141 insertions, 22 deletions
@@ -22,6 +22,10 @@ Python 2 was revision 605. * Issue a warning when an HTML parser is used to parse a document that looks like XML but not XHTML. [bug=1939121] +* Do a better job of keeping track of namespaces as an XML document is + parsed, so that CSS selectors that use namespaces will do the right + thing more often. [bug=1946243] + * Some time ago, the misleadingly named "text" argument to find-type methods was renamed to the more accurate "string." But this supposed "renaming" didn't make it into important places like the method diff --git a/bs4/__init__.py b/bs4/__init__.py index 49e05e7..ddf1a86 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -309,8 +309,6 @@ class BeautifulSoup(Tag): self._namespaces = dict() self.parse_only = parse_only - self.builder.initialize_soup(self) - if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() elif len(markup) <= 256 and ( @@ -362,6 +360,7 @@ class BeautifulSoup(Tag): self.builder.prepare_markup( markup, from_encoding, exclude_encodings=exclude_encodings)): self.reset() + self.builder.initialize_soup(self) try: self._feed() success = True @@ -400,7 +399,7 @@ class BeautifulSoup(Tag): if 'builder' in d and not self.builder.picklable: d['builder'] = None return d - + @classmethod def _decode_markup(cls, markup): """Ensure `markup` is bytes so it's safe to send into warnings.warn. @@ -693,7 +692,7 @@ class BeautifulSoup(Tag): return most_recently_popped def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None, - sourcepos=None): + sourcepos=None, namespaces=None): """Called by the tree builder when a new tag is encountered. :param name: Name of the tag. @@ -703,6 +702,8 @@ class BeautifulSoup(Tag): source document. :param sourcepos: The character position within `sourceline` where this tag was found. + :param namespaces: A dictionary of all namespace prefix mappings + currently in scope in the document. 
If this method returns None, the tag was rejected by an active SoupStrainer. You should proceed as if the tag had not occurred @@ -720,7 +721,8 @@ class BeautifulSoup(Tag): tag = self.element_classes.get(Tag, Tag)( self, self.builder, name, namespace, nsprefix, attrs, self.currentTag, self._most_recent_element, - sourceline=sourceline, sourcepos=sourcepos + sourceline=sourceline, sourcepos=sourcepos, + namespaces=namespaces ) if tag is None: return tag diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index d8251b2..971c81e 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -80,15 +80,24 @@ class LXMLTreeBuilderForXML(TreeBuilder): This might be useful later on when creating CSS selectors. + This will track (almost) all namespaces, even ones that were + only in scope for part of the document. If two namespaces have + the same prefix, only the first one encountered will be + tracked. Un-prefixed namespaces are not tracked. + :param mapping: A dictionary mapping namespace prefixes to URIs. """ for key, value in list(mapping.items()): + # This is 'if key' and not 'if key is not None' because we + # don't track un-prefixed namespaces. Soupselect will + # treat an un-prefixed namespace as the default, which + # causes confusion in some cases. if key and key not in self.soup._namespaces: # Let the BeautifulSoup object know about a new namespace. # If there are multiple namespaces defined with the same # prefix, the first one in the document takes precedence. self.soup._namespaces[key] = value - + def default_parser(self, encoding): """Find the default parser for the given encoding. 
@@ -126,6 +135,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.empty_element_tags = set(empty_element_tags) self.soup = None self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] + self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)] super(LXMLTreeBuilderForXML, self).__init__(**kwargs) def _getNsTag(self, tag): @@ -250,6 +260,20 @@ class LXMLTreeBuilderForXML(TreeBuilder): # mappings. self.nsmaps.append(_invert(nsmap)) + # The currently active namespace prefixes have + # changed. Calculate the new mapping so it can be stored + # with all Tag objects created while these prefixes are in + # scope. + current_mapping = dict(self.active_namespace_prefixes[-1]) + current_mapping.update(nsmap) + + # We should not track un-prefixed namespaces as we can only hold one + # and it will be recognized as the default namespace by soupsieve, + # which may be confusing in some situations. + if '' in current_mapping: + del current_mapping[''] + self.active_namespace_prefixes.append(current_mapping) + # Also treat the namespace mapping as a set of attributes on the # tag, so we can recreate it later. attrs = attrs.copy() @@ -274,8 +298,11 @@ class LXMLTreeBuilderForXML(TreeBuilder): namespace, name = self._getNsTag(name) nsprefix = self._prefix_for_namespace(namespace) - self.soup.handle_starttag(name, namespace, nsprefix, attrs) - + self.soup.handle_starttag( + name, namespace, nsprefix, attrs, + namespaces=self.active_namespace_prefixes[-1] + ) + def _prefix_for_namespace(self, namespace): """Find the currently active prefix for the given namespace.""" if namespace is None: @@ -299,8 +326,14 @@ class LXMLTreeBuilderForXML(TreeBuilder): if len(self.nsmaps) > 1: # This tag, or one of its parents, introduced a namespace # mapping, so pop it off the stack. - self.nsmaps.pop() - + out_of_scope_nsmap = self.nsmaps.pop() + + if out_of_scope_nsmap is not None: + # This tag introduced a namespace mapping which is no + # longer in scope. 
Recalculate the currently active + # namespace prefixes. + self.active_namespace_prefixes.pop() + def pi(self, target, data): self.soup.endData() data = target + ' ' + data diff --git a/bs4/element.py b/bs4/element.py index 3383621..c6cb2eb 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1183,6 +1183,7 @@ class Tag(PageElement): can_be_empty_element=None, cdata_list_attributes=None, preserve_whitespace_tags=None, interesting_string_types=None, + namespaces=None ): """Basic constructor. @@ -1215,6 +1216,9 @@ class Tag(PageElement): to be considered. The default is to consider NavigableString and CData the only interesting string subtypes. + :param namespaces: A dictionary mapping currently active + namespace prefixes to URIs. This can be used later to + construct CSS selectors. """ if parser is None: self.parser_class = None @@ -1226,6 +1230,7 @@ class Tag(PageElement): raise ValueError("No value provided for new tag's name.") self.name = name self.namespace = namespace + self._namespaces = namespaces or {} self.prefix = prefix if ((not builder or builder.store_line_numbers) and (sourceline is not None or sourcepos is not None)): @@ -1308,7 +1313,7 @@ class Tag(PageElement): for child in self.contents: clone.append(child.__copy__()) return clone - + @property def is_empty_element(self): """Is this tag an empty-element tag? (aka a self-closing tag) diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index be954db..41319d1 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -95,18 +95,93 @@ class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest): return LXMLTreeBuilderForXML def test_namespace_indexing(self): - # We should not track un-prefixed namespaces as we can only hold one - # and it will be recognized as the default namespace by soupsieve, - # which may be confusing in some situations. When no namespace is provided - # for a selector, the default namespace (if defined) is assumed. 
- soup = self.soup( '<?xml version="1.1"?>\n' '<root>' '<tag xmlns="http://unprefixed-namespace.com">content</tag>' - '<prefix:tag xmlns:prefix="http://prefixed-namespace.com">content</tag>' + '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</prefix:tag2>' + '<prefix2:tag3 xmlns:prefix2="http://another-namespace.com">' + '<subtag xmlns="http://another-unprefixed-namespace.com">' + '<subsubtag xmlns="http://yet-another-unprefixed-namespace.com">' + '</prefix2:tag3>' '</root>' ) - assert soup._namespaces == ( - {'xml': 'http://www.w3.org/XML/1998/namespace', 'prefix': 'http://prefixed-namespace.com'} + + # The BeautifulSoup object includes every namespace prefix + # defined in the entire document. This is the default set of + # namespaces used by soupsieve. + # + # Un-prefixed namespaces are not included, and if a given + # prefix is defined twice, only the first prefix encountered + # in the document shows up here. + assert soup._namespaces == { + 'xml': 'http://www.w3.org/XML/1998/namespace', + 'prefix': 'http://prefixed-namespace.com', + 'prefix2': 'http://another-namespace.com' + } + + # A Tag object includes only the namespace prefixes + # that were in scope when it was parsed. + + # We do not track un-prefixed namespaces as we can only hold + # one (the first one), and it will be recognized as the + # default namespace by soupsieve, even when operating from a + # tag with a different un-prefixed namespace. 
+ assert soup.tag._namespaces == { + 'xml': 'http://www.w3.org/XML/1998/namespace', + } + + assert soup.tag2._namespaces == { + 'prefix': 'http://prefixed-namespace.com', + 'xml': 'http://www.w3.org/XML/1998/namespace', + } + + assert soup.subtag._namespaces == { + 'prefix2': 'http://another-namespace.com', + 'xml': 'http://www.w3.org/XML/1998/namespace', + } + + assert soup.subsubtag._namespaces == { + 'prefix2': 'http://another-namespace.com', + 'xml': 'http://www.w3.org/XML/1998/namespace', + } + + + def test_namespace_interaction_with_select_and_find(self): + # Demonstrate how namespaces interact with select* and + # find* methods. + + soup = self.soup( + '<?xml version="1.1"?>\n' + '<root>' + '<tag xmlns="http://unprefixed-namespace.com">content</tag>' + '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</tag>' + '<subtag xmlns:prefix="http://another-namespace-same-prefix.com">' + '<prefix:tag3>' + '</subtag>' + '</root>' ) + + # soupselect uses namespace URIs. + assert soup.select_one('tag').name == 'tag' + assert soup.select_one('prefix|tag2').name == 'tag2' + + # If a prefix is declared more than once, only the first usage + # is registered with the BeautifulSoup object. + assert soup.select_one('prefix|tag3') is None + + # But you can always explicitly specify a namespace dictionary. + assert soup.select_one( + 'prefix|tag3', namespaces=soup.subtag._namespaces + ).name == 'tag3' + + # And a Tag (as opposed to the BeautifulSoup object) will + # have a set of default namespaces scoped to that Tag. + assert soup.subtag.select_one('prefix|tag3').name=='tag3' + + # the find() methods aren't fully namespace-aware; they just + # look at prefixes. 
+ assert soup.find('tag').name == 'tag' + assert soup.find('prefix:tag2').name == 'tag2' + assert soup.find('prefix:tag3').name == 'tag3' + assert soup.subtag.find('prefix:tag3').name == 'tag3' diff --git a/doc/source/index.rst b/doc/source/index.rst index 67d3648..d81fccd 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -1800,9 +1800,9 @@ selectors.:: # [<ns1:child>I'm in namespace 1</ns1:child>] When handling a CSS selector that uses namespaces, Beautiful Soup -uses the namespace abbreviations it found when parsing the -document. You can override this by passing in your own dictionary of -abbreviations:: +always tries to use namespace prefixes that make sense based on what +it saw while parsing the document. You can always provide your own +dictionary of abbreviations:: namespaces = dict(first="http://namespace1/", second="http://namespace2/") soup.select("second|child", namespaces=namespaces) |