summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--bs4/__init__.py8
-rw-r--r--bs4/builder/_html5lib.py12
-rw-r--r--bs4/builder/_htmlparser.py3
-rw-r--r--bs4/builder/_lxml.py3
-rw-r--r--bs4/element.py21
5 files changed, 31 insertions, 16 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 98ac57b..786b57b 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -193,9 +193,9 @@ class BeautifulSoup(Tag):
self.tagStack = []
self.pushTag(self)
- def new_tag(self, name, **attrs):
+ def new_tag(self, name, namespace=None, **attrs):
"""Create a new tag associated with this soup."""
- return Tag(None, self.builder, name, attrs)
+ return Tag(None, self.builder, name, namespace, attrs)
def new_string(self, s):
"""Create a new NavigableString associated with this soup."""
@@ -272,7 +272,7 @@ class BeautifulSoup(Tag):
mostRecentTag = self.popTag()
return mostRecentTag
- def handle_starttag(self, name, attrs):
+ def handle_starttag(self, name, namespace, attrs):
"""Push a start tag on to the stack.
If this method returns None, the tag was rejected by the
@@ -289,7 +289,7 @@ class BeautifulSoup(Tag):
or not self.parse_only.search_tag(name, attrs))):
return None
- tag = Tag(self, self.builder, name, attrs, self.currentTag,
+ tag = Tag(self, self.builder, name, namespace, attrs, self.currentTag,
self.previous_element)
if tag is None:
return tag
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 0d7a1a9..7ce69aa 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -8,6 +8,7 @@ from bs4.builder import (
HTML_5,
HTMLTreeBuilder,
)
+from bs4.element import NamespacedAttribute
import html5lib
from html5lib.constants import (
DataLossWarning,
@@ -58,9 +59,6 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
def __init__(self, soup, namespaceHTMLElements):
self.soup = soup
- if namespaceHTMLElements:
- warnings.warn("namespaceHTMLElements not supported yet",
- DataLossWarning)
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
def documentClass(self):
@@ -76,9 +74,7 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
self.soup.object_was_parsed(doctype)
def elementClass(self, name, namespace):
- if namespace is not None:
- warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
- tag = self.soup.new_tag(name)
+ tag = self.soup.new_tag(name, namespace)
return Element(tag, self.soup, namespace)
def commentClass(self, data):
@@ -144,6 +140,8 @@ class Element(html5lib.treebuilders._base.Node):
def setAttributes(self, attributes):
if attributes is not None and attributes != {}:
for name, value in list(attributes.items()):
+ if isinstance(name, tuple):
+ name = NamespacedAttribute(*name)
self.element[name] = value
# The attributes may contain variables that need substitution.
# Call set_up_substitutions manually.
@@ -189,7 +187,7 @@ class Element(html5lib.treebuilders._base.Node):
TextNode(child, self.soup))
def cloneNode(self):
- tag = self.soup.new_tag(self.element.name)
+ tag = self.soup.new_tag(self.element.name, self.namespace)
node = Element(tag, self.soup, self.namespace)
for key,value in self.attributes:
node.attributes[key] = value
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index c785eed..ec6d456 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -68,7 +68,8 @@ class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder):
super(HTMLParserTreeBuilder, self).feed(markup)
def handle_starttag(self, name, attrs):
- self.soup.handle_starttag(name, dict(attrs))
+ # XXX namespace
+ self.soup.handle_starttag(name, None, dict(attrs))
def handle_endtag(self, name):
self.soup.handle_endtag(name)
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index cc3cb86..985a030 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -66,7 +66,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
pass
def start(self, name, attrs):
- self.soup.handle_starttag(name, attrs)
+ # XXX namespace
+ self.soup.handle_starttag(name, None, attrs)
def end(self, name):
self.soup.endData()
diff --git a/bs4/element.py b/bs4/element.py
index 513407c..926fb8f 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -22,6 +22,20 @@ def _alias(attr):
return alias
+class NamespacedAttribute(object):
+
+ def __init__(self, namespace_abbreviation, name, namespace):
+ self.namespace_abbreviation = namespace_abbreviation
+ self.name = name
+ self.namespace = namespace
+
+ def __str__(self):
+ name = self.name
+ if self.namespace_abbreviation:
+ name = self.namespace_abbreviation + ":" + name
+ return name
+
+
class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
@@ -507,8 +521,8 @@ class Tag(PageElement):
"""Represents a found HTML tag with its attributes and contents."""
- def __init__(self, parser=None, builder=None, name=None, attrs=None,
- parent=None, previous=None):
+ def __init__(self, parser=None, builder=None, name=None, namespace=None,
+ attrs=None, parent=None, previous=None):
"Basic constructor."
if parser is None:
@@ -520,6 +534,7 @@ class Tag(PageElement):
if name is None:
raise ValueError("No value provided for new tag's name.")
self.name = name
+ self.namespace = namespace
if attrs is None:
attrs = {}
else:
@@ -779,7 +794,7 @@ class Tag(PageElement):
and '%SOUP-ENCODING%' in val):
val = self.substitute_encoding(val, eventual_encoding)
- decoded = (key + '='
+ decoded = (str(key) + '='
+ EntitySubstitution.substitute_xml(val, True))
attrs.append(decoded)
close = ''