summaryrefslogtreecommitdiff
path: root/bs4/builder
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/builder')
-rw-r--r--bs4/builder/_html5lib.py12
-rw-r--r--bs4/builder/_htmlparser.py3
-rw-r--r--bs4/builder/_lxml.py53
3 files changed, 55 insertions, 13 deletions
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 0d7a1a9..7ce69aa 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -8,6 +8,7 @@ from bs4.builder import (
HTML_5,
HTMLTreeBuilder,
)
+from bs4.element import NamespacedAttribute
import html5lib
from html5lib.constants import (
DataLossWarning,
@@ -58,9 +59,6 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
def __init__(self, soup, namespaceHTMLElements):
self.soup = soup
- if namespaceHTMLElements:
- warnings.warn("namespaceHTMLElements not supported yet",
- DataLossWarning)
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
def documentClass(self):
@@ -76,9 +74,7 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
self.soup.object_was_parsed(doctype)
def elementClass(self, name, namespace):
- if namespace is not None:
- warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
- tag = self.soup.new_tag(name)
+ tag = self.soup.new_tag(name, namespace)
return Element(tag, self.soup, namespace)
def commentClass(self, data):
@@ -144,6 +140,8 @@ class Element(html5lib.treebuilders._base.Node):
def setAttributes(self, attributes):
if attributes is not None and attributes != {}:
for name, value in list(attributes.items()):
+ if isinstance(name, tuple):
+ name = NamespacedAttribute(*name)
self.element[name] = value
# The attributes may contain variables that need substitution.
# Call set_up_substitutions manually.
@@ -189,7 +187,7 @@ class Element(html5lib.treebuilders._base.Node):
TextNode(child, self.soup))
def cloneNode(self):
- tag = self.soup.new_tag(self.element.name)
+ tag = self.soup.new_tag(self.element.name, self.namespace)
node = Element(tag, self.soup, self.namespace)
for key,value in self.attributes:
node.attributes[key] = value
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 62473cf..c307ff8 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -40,7 +40,8 @@ HTMLPARSER = 'html.parser'
class BeautifulSoupHTMLParser(HTMLParser):
def handle_starttag(self, name, attrs):
- self.soup.handle_starttag(name, dict(attrs))
+ # XXX namespace
+ self.soup.handle_starttag(name, None, None, dict(attrs))
def handle_endtag(self, name):
self.soup.handle_endtag(name)
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index cc3cb86..870d59e 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -5,7 +5,7 @@ __all__ = [
import collections
from lxml import etree
-from bs4.element import Comment, Doctype
+from bs4.element import Comment, Doctype, NamespacedAttribute
from bs4.builder import (
FAST,
HTML,
@@ -42,6 +42,15 @@ class LXMLTreeBuilderForXML(TreeBuilder):
parser = parser(target=self, strip_cdata=False)
self.parser = parser
self.soup = None
+ self.nsmaps = None
+
+ def _getNsTag(self, tag):
+ # Split the namespace URL out of a fully-qualified lxml tag
+ # name. Copied from lxml's src/lxml/sax.py.
+ if tag[0] == '{':
+ return tuple(tag[1:].split('}', 1))
+ else:
+ return (None, tag)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
@@ -63,15 +72,49 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.parser.close()
def close(self):
- pass
-
- def start(self, name, attrs):
- self.soup.handle_starttag(name, attrs)
+ self.nsmaps = None
+
+ def start(self, name, attrs, nsmap={}):
+ nsprefix = None
+ # Invert each namespace map as it comes in.
+ if len(nsmap) == 0 and self.nsmaps != None:
+ # There are no new namespaces for this tag, but namespaces
+ # are in play, so we need a separate tag stack to know
+ # when they end.
+ self.nsmaps.append(None)
+ elif len(nsmap) > 0:
+ # A new namespace mapping has come into play.
+ if self.nsmaps is None:
+ self.nsmaps = []
+ inverted_nsmap = dict((value, key) for key, value in nsmap.items())
+ self.nsmaps.append(inverted_nsmap)
+ # Also treat the namespace mapping as a set of attributes on the
+ # tag, so we can recreate it later.
+ attrs = attrs.copy()
+ for prefix, namespace in nsmap.items():
+ attribute = NamespacedAttribute(
+ "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
+ attrs[attribute] = namespace
+ namespace, name = self._getNsTag(name)
+ if namespace is not None:
+ for inverted_nsmap in reversed(self.nsmaps):
+ if inverted_nsmap is not None and namespace in inverted_nsmap:
+ nsprefix = inverted_nsmap[namespace]
+ break
+ self.soup.handle_starttag(name, namespace, nsprefix, attrs)
def end(self, name):
self.soup.endData()
completed_tag = self.soup.tagStack[-1]
self.soup.handle_endtag(name)
+ if self.nsmaps != None:
+ # This tag, or one of its parents, introduced a namespace
+ # mapping, so pop it off the stack.
+ self.nsmaps.pop()
+ if len(self.nsmaps) == 0:
+ # Namespaces are no longer in play, so don't bother keeping
+ # track of the namespace stack.
+ self.nsmaps = None
def pi(self, target, data):
pass