diff options
-rw-r--r-- | NEWS.txt | 3 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 30 | ||||
-rw-r--r-- | bs4/testing.py | 5 |
3 files changed, 33 insertions, 5 deletions
@@ -1,5 +1,8 @@ = 4.1.2 (Unreleased) = +* Use namespace prefixes for namespaced attribute names, instead of + the fully-qualified names given by the lxml parser. [bug=1037597] + * When sniffing encodings, if the cchardet library is installed, use it instead of chardet. It's much faster. [bug=1020748] diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index c78fdff..f6b91ff 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -111,14 +111,34 @@ class LXMLTreeBuilderForXML(TreeBuilder): attribute = NamespacedAttribute( "xmlns", prefix, "http://www.w3.org/2000/xmlns/") attrs[attribute] = namespace + + if self.nsmaps is not None and len(self.nsmaps) > 0: + # Namespaces are in play. Find any attributes that came in + # from lxml with namespaces attached to their names, and + # turn then into NamespacedAttribute objects. + new_attrs = {} + for attr, value in attrs.items(): + namespace, attr = self._getNsTag(attr) + if namespace is None: + new_attrs[attr] = value + else: + nsprefix = self._prefix_for_namespace(namespace) + attr = NamespacedAttribute(nsprefix, attr, namespace) + new_attrs[attr] = value + attrs = new_attrs + namespace, name = self._getNsTag(name) - if namespace is not None: - for inverted_nsmap in reversed(self.nsmaps): - if inverted_nsmap is not None and namespace in inverted_nsmap: - nsprefix = inverted_nsmap[namespace] - break + nsprefix = self._prefix_for_namespace(namespace) self.soup.handle_starttag(name, namespace, nsprefix, attrs) + def _prefix_for_namespace(self, namespace): + """Find the currently active prefix for the given namespace.""" + if namespace is None: + return None + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + return inverted_nsmap[namespace] + def end(self, name): self.soup.endData() completed_tag = self.soup.tagStack[-1] diff --git a/bs4/testing.py b/bs4/testing.py index 4fec24b..30e74f4 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -490,6 +490,11 @@ class XMLTreeBuilderSmokeTest(object): soup = self.soup(markup) self.assertEqual(unicode(soup.p), markup) + def test_namespaced_attributes(self): + markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>' + soup = self.soup(markup) + self.assertEqual(unicode(soup.foo), markup) + class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): """Smoke test for a tree builder that supports HTML5.""" |