summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2021-11-29 22:13:33 -0500
committer Leonard Richardson <leonardr@segfault.org> 2021-11-29 22:13:33 -0500
commit ad52722cc6b55ce414d395e9a0860cee57c0ab2d (patch)
tree 8ff820b41d9ee5fb1f896629782270349cd8311b
parent c005e9ba28b4eec3a5fab173b928609bc692dd51 (diff)
Do a better job of keeping track of namespaces as an XML document is
parsed, so that CSS selectors that use namespaces will do the right thing more often. [bug=1946243]
-rw-r--r-- CHANGELOG 4
-rw-r--r-- bs4/__init__.py 12
-rw-r--r-- bs4/builder/_lxml.py 43
-rw-r--r-- bs4/element.py 7
-rw-r--r-- bs4/tests/test_lxml.py 91
-rw-r--r-- doc/source/index.rst 6
6 files changed, 141 insertions, 22 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 4470f64..af99990 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -22,6 +22,10 @@ Python 2 was revision 605.
* Issue a warning when an HTML parser is used to parse a document that
looks like XML but not XHTML. [bug=1939121]
+* Do a better job of keeping track of namespaces as an XML document is
+ parsed, so that CSS selectors that use namespaces will do the right
+ thing more often. [bug=1946243]
+
* Some time ago, the misleadingly named "text" argument to find-type
methods was renamed to the more accurate "string." But this supposed
"renaming" didn't make it into important places like the method
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 49e05e7..ddf1a86 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -309,8 +309,6 @@ class BeautifulSoup(Tag):
self._namespaces = dict()
self.parse_only = parse_only
- self.builder.initialize_soup(self)
-
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
elif len(markup) <= 256 and (
@@ -362,6 +360,7 @@ class BeautifulSoup(Tag):
self.builder.prepare_markup(
markup, from_encoding, exclude_encodings=exclude_encodings)):
self.reset()
+ self.builder.initialize_soup(self)
try:
self._feed()
success = True
@@ -400,7 +399,7 @@ class BeautifulSoup(Tag):
if 'builder' in d and not self.builder.picklable:
d['builder'] = None
return d
-
+
@classmethod
def _decode_markup(cls, markup):
"""Ensure `markup` is bytes so it's safe to send into warnings.warn.
@@ -693,7 +692,7 @@ class BeautifulSoup(Tag):
return most_recently_popped
def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
- sourcepos=None):
+ sourcepos=None, namespaces=None):
"""Called by the tree builder when a new tag is encountered.
:param name: Name of the tag.
@@ -703,6 +702,8 @@ class BeautifulSoup(Tag):
source document.
:param sourcepos: The character position within `sourceline` where this
tag was found.
+ :param namespaces: A dictionary of all namespace prefix mappings
+ currently in scope in the document.
If this method returns None, the tag was rejected by an active
SoupStrainer. You should proceed as if the tag had not occurred
@@ -720,7 +721,8 @@ class BeautifulSoup(Tag):
tag = self.element_classes.get(Tag, Tag)(
self, self.builder, name, namespace, nsprefix, attrs,
self.currentTag, self._most_recent_element,
- sourceline=sourceline, sourcepos=sourcepos
+ sourceline=sourceline, sourcepos=sourcepos,
+ namespaces=namespaces
)
if tag is None:
return tag
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index d8251b2..971c81e 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -80,15 +80,24 @@ class LXMLTreeBuilderForXML(TreeBuilder):
This might be useful later on when creating CSS selectors.
+ This will track (almost) all namespaces, even ones that were
+ only in scope for part of the document. If two namespaces have
+ the same prefix, only the first one encountered will be
+ tracked. Un-prefixed namespaces are not tracked.
+
:param mapping: A dictionary mapping namespace prefixes to URIs.
"""
for key, value in list(mapping.items()):
+ # This is 'if key' and not 'if key is not None' because we
+ # don't track un-prefixed namespaces. Soupsieve will
+ # treat an un-prefixed namespace as the default, which
+ # causes confusion in some cases.
if key and key not in self.soup._namespaces:
# Let the BeautifulSoup object know about a new namespace.
# If there are multiple namespaces defined with the same
# prefix, the first one in the document takes precedence.
self.soup._namespaces[key] = value
-
+
def default_parser(self, encoding):
"""Find the default parser for the given encoding.
@@ -126,6 +135,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.empty_element_tags = set(empty_element_tags)
self.soup = None
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+ self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
def _getNsTag(self, tag):
@@ -250,6 +260,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# mappings.
self.nsmaps.append(_invert(nsmap))
+ # The currently active namespace prefixes have
+ # changed. Calculate the new mapping so it can be stored
+ # with all Tag objects created while these prefixes are in
+ # scope.
+ current_mapping = dict(self.active_namespace_prefixes[-1])
+ current_mapping.update(nsmap)
+
+ # We should not track un-prefixed namespaces as we can only hold one
+ # and it will be recognized as the default namespace by soupsieve,
+ # which may be confusing in some situations.
+ if '' in current_mapping:
+ del current_mapping['']
+ self.active_namespace_prefixes.append(current_mapping)
+
# Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later.
attrs = attrs.copy()
@@ -274,8 +298,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
namespace, name = self._getNsTag(name)
nsprefix = self._prefix_for_namespace(namespace)
- self.soup.handle_starttag(name, namespace, nsprefix, attrs)
-
+ self.soup.handle_starttag(
+ name, namespace, nsprefix, attrs,
+ namespaces=self.active_namespace_prefixes[-1]
+ )
+
def _prefix_for_namespace(self, namespace):
"""Find the currently active prefix for the given namespace."""
if namespace is None:
@@ -299,8 +326,14 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if len(self.nsmaps) > 1:
# This tag, or one of its parents, introduced a namespace
# mapping, so pop it off the stack.
- self.nsmaps.pop()
-
+ out_of_scope_nsmap = self.nsmaps.pop()
+
+ if out_of_scope_nsmap is not None:
+ # This tag introduced a namespace mapping which is no
+ # longer in scope. Discard its entry so the previously
+ # active namespace prefixes are current again.
+ self.active_namespace_prefixes.pop()
+
def pi(self, target, data):
self.soup.endData()
data = target + ' ' + data
diff --git a/bs4/element.py b/bs4/element.py
index 3383621..c6cb2eb 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1183,6 +1183,7 @@ class Tag(PageElement):
can_be_empty_element=None, cdata_list_attributes=None,
preserve_whitespace_tags=None,
interesting_string_types=None,
+ namespaces=None
):
"""Basic constructor.
@@ -1215,6 +1216,9 @@ class Tag(PageElement):
to be considered. The default is to consider
NavigableString and CData the only interesting string
subtypes.
+ :param namespaces: A dictionary mapping currently active
+ namespace prefixes to URIs. This can be used later to
+ construct CSS selectors.
"""
if parser is None:
self.parser_class = None
@@ -1226,6 +1230,7 @@ class Tag(PageElement):
raise ValueError("No value provided for new tag's name.")
self.name = name
self.namespace = namespace
+ self._namespaces = namespaces or {}
self.prefix = prefix
if ((not builder or builder.store_line_numbers)
and (sourceline is not None or sourcepos is not None)):
@@ -1308,7 +1313,7 @@ class Tag(PageElement):
for child in self.contents:
clone.append(child.__copy__())
return clone
-
+
@property
def is_empty_element(self):
"""Is this tag an empty-element tag? (aka a self-closing tag)
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index be954db..41319d1 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -95,18 +95,93 @@ class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
return LXMLTreeBuilderForXML
def test_namespace_indexing(self):
- # We should not track un-prefixed namespaces as we can only hold one
- # and it will be recognized as the default namespace by soupsieve,
- # which may be confusing in some situations. When no namespace is provided
- # for a selector, the default namespace (if defined) is assumed.
-
soup = self.soup(
'<?xml version="1.1"?>\n'
'<root>'
'<tag xmlns="http://unprefixed-namespace.com">content</tag>'
- '<prefix:tag xmlns:prefix="http://prefixed-namespace.com">content</tag>'
+ '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</prefix:tag2>'
+ '<prefix2:tag3 xmlns:prefix2="http://another-namespace.com">'
+ '<subtag xmlns="http://another-unprefixed-namespace.com">'
+ '<subsubtag xmlns="http://yet-another-unprefixed-namespace.com">'
+ '</prefix2:tag3>'
'</root>'
)
- assert soup._namespaces == (
- {'xml': 'http://www.w3.org/XML/1998/namespace', 'prefix': 'http://prefixed-namespace.com'}
+
+ # The BeautifulSoup object includes every namespace prefix
+ # defined in the entire document. This is the default set of
+ # namespaces used by soupsieve.
+ #
+ # Un-prefixed namespaces are not included, and if a given
+ # prefix is defined twice, only the first prefix encountered
+ # in the document shows up here.
+ assert soup._namespaces == {
+ 'xml': 'http://www.w3.org/XML/1998/namespace',
+ 'prefix': 'http://prefixed-namespace.com',
+ 'prefix2': 'http://another-namespace.com'
+ }
+
+ # A Tag object includes only the namespace prefixes
+ # that were in scope when it was parsed.
+
+ # We do not track un-prefixed namespaces as we can only hold
+ # one (the first one), and it will be recognized as the
+ # default namespace by soupsieve, even when operating from a
+ # tag with a different un-prefixed namespace.
+ assert soup.tag._namespaces == {
+ 'xml': 'http://www.w3.org/XML/1998/namespace',
+ }
+
+ assert soup.tag2._namespaces == {
+ 'prefix': 'http://prefixed-namespace.com',
+ 'xml': 'http://www.w3.org/XML/1998/namespace',
+ }
+
+ assert soup.subtag._namespaces == {
+ 'prefix2': 'http://another-namespace.com',
+ 'xml': 'http://www.w3.org/XML/1998/namespace',
+ }
+
+ assert soup.subsubtag._namespaces == {
+ 'prefix2': 'http://another-namespace.com',
+ 'xml': 'http://www.w3.org/XML/1998/namespace',
+ }
+
+
+ def test_namespace_interaction_with_select_and_find(self):
+ # Demonstrate how namespaces interact with select* and
+ # find* methods.
+
+ soup = self.soup(
+ '<?xml version="1.1"?>\n'
+ '<root>'
+ '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
+ '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</tag>'
+ '<subtag xmlns:prefix="http://another-namespace-same-prefix.com">'
+ '<prefix:tag3>'
+ '</subtag>'
+ '</root>'
)
+
+ # soupsieve uses namespace URIs.
+ assert soup.select_one('tag').name == 'tag'
+ assert soup.select_one('prefix|tag2').name == 'tag2'
+
+ # If a prefix is declared more than once, only the first usage
+ # is registered with the BeautifulSoup object.
+ assert soup.select_one('prefix|tag3') is None
+
+ # But you can always explicitly specify a namespace dictionary.
+ assert soup.select_one(
+ 'prefix|tag3', namespaces=soup.subtag._namespaces
+ ).name == 'tag3'
+
+ # And a Tag (as opposed to the BeautifulSoup object) will
+ # have a set of default namespaces scoped to that Tag.
+ assert soup.subtag.select_one('prefix|tag3').name=='tag3'
+
+ # The find() methods aren't fully namespace-aware; they just
+ # look at prefixes.
+ assert soup.find('tag').name == 'tag'
+ assert soup.find('prefix:tag2').name == 'tag2'
+ assert soup.find('prefix:tag3').name == 'tag3'
+ assert soup.subtag.find('prefix:tag3').name == 'tag3'
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 67d3648..d81fccd 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -1800,9 +1800,9 @@ selectors.::
# [<ns1:child>I'm in namespace 1</ns1:child>]
When handling a CSS selector that uses namespaces, Beautiful Soup
-uses the namespace abbreviations it found when parsing the
-document. You can override this by passing in your own dictionary of
-abbreviations::
+always tries to use namespace prefixes that make sense based on what
+it saw while parsing the document. You can always provide your own
+dictionary of abbreviations::
namespaces = dict(first="http://namespace1/", second="http://namespace2/")
soup.select("second|child", namespaces=namespaces)