diff options
author | Leonard Richardson <leonardr@segfault.org> | 2021-11-29 22:13:33 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2021-11-29 22:13:33 -0500 |
commit | ad52722cc6b55ce414d395e9a0860cee57c0ab2d (patch) | |
tree | 8ff820b41d9ee5fb1f896629782270349cd8311b /bs4/tests/test_lxml.py | |
parent | c005e9ba28b4eec3a5fab173b928609bc692dd51 (diff) |
Do a better job of keeping track of namespaces as an XML document is
parsed, so that CSS selectors that use namespaces will do the right
thing more often. [bug=1946243]
Diffstat (limited to 'bs4/tests/test_lxml.py')
-rw-r--r-- | bs4/tests/test_lxml.py | 91 |
1 file changed, 83 insertions, 8 deletions
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index be954db..41319d1 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -95,18 +95,93 @@ class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest): return LXMLTreeBuilderForXML def test_namespace_indexing(self): - # We should not track un-prefixed namespaces as we can only hold one - # and it will be recognized as the default namespace by soupsieve, - # which may be confusing in some situations. When no namespace is provided - # for a selector, the default namespace (if defined) is assumed. - soup = self.soup( '<?xml version="1.1"?>\n' '<root>' '<tag xmlns="http://unprefixed-namespace.com">content</tag>' - '<prefix:tag xmlns:prefix="http://prefixed-namespace.com">content</tag>' + '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</prefix:tag2>' + '<prefix2:tag3 xmlns:prefix2="http://another-namespace.com">' + '<subtag xmlns="http://another-unprefixed-namespace.com">' + '<subsubtag xmlns="http://yet-another-unprefixed-namespace.com">' + '</prefix2:tag3>' '</root>' ) - assert soup._namespaces == ( - {'xml': 'http://www.w3.org/XML/1998/namespace', 'prefix': 'http://prefixed-namespace.com'} + + # The BeautifulSoup object includes every namespace prefix + # defined in the entire document. This is the default set of + # namespaces used by soupsieve. + # + # Un-prefixed namespaces are not included, and if a given + # prefix is defined twice, only the first prefix encountered + # in the document shows up here. + assert soup._namespaces == { + 'xml': 'http://www.w3.org/XML/1998/namespace', + 'prefix': 'http://prefixed-namespace.com', + 'prefix2': 'http://another-namespace.com' + } + + # A Tag object includes only the namespace prefixes + # that were in scope when it was parsed. 
+ + # We do not track un-prefixed namespaces as we can only hold + # one (the first one), and it will be recognized as the + # default namespace by soupsieve, even when operating from a + # tag with a different un-prefixed namespace. + assert soup.tag._namespaces == { + 'xml': 'http://www.w3.org/XML/1998/namespace', + } + + assert soup.tag2._namespaces == { + 'prefix': 'http://prefixed-namespace.com', + 'xml': 'http://www.w3.org/XML/1998/namespace', + } + + assert soup.subtag._namespaces == { + 'prefix2': 'http://another-namespace.com', + 'xml': 'http://www.w3.org/XML/1998/namespace', + } + + assert soup.subsubtag._namespaces == { + 'prefix2': 'http://another-namespace.com', + 'xml': 'http://www.w3.org/XML/1998/namespace', + } + + + def test_namespace_interaction_with_select_and_find(self): + # Demonstrate how namespaces interact with select* and + # find* methods. + + soup = self.soup( + '<?xml version="1.1"?>\n' + '<root>' + '<tag xmlns="http://unprefixed-namespace.com">content</tag>' + '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</tag>' + '<subtag xmlns:prefix="http://another-namespace-same-prefix.com">' + '<prefix:tag3>' + '</subtag>' + '</root>' ) + + # soupselect uses namespace URIs. + assert soup.select_one('tag').name == 'tag' + assert soup.select_one('prefix|tag2').name == 'tag2' + + # If a prefix is declared more than once, only the first usage + # is registered with the BeautifulSoup object. + assert soup.select_one('prefix|tag3') is None + + # But you can always explicitly specify a namespace dictionary. + assert soup.select_one( + 'prefix|tag3', namespaces=soup.subtag._namespaces + ).name == 'tag3' + + # And a Tag (as opposed to the BeautifulSoup object) will + # have a set of default namespaces scoped to that Tag. + assert soup.subtag.select_one('prefix|tag3').name=='tag3' + + # the find() methods aren't fully namespace-aware; they just + # look at prefixes. 
+ assert soup.find('tag').name == 'tag' + assert soup.find('prefix:tag2').name == 'tag2' + assert soup.find('prefix:tag3').name == 'tag3' + assert soup.subtag.find('prefix:tag3').name == 'tag3' |