diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-16 16:33:40 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-16 16:33:40 -0500 |
commit | e1b321db7331752a3aea8dd7070dd0db4c60c51d (patch) | |
tree | 96ce9539fb1e0dbb1dc8b4f264737159e9a2cd1d | |
parent | 1a50d9623831990ae0a78ea3a7e66fa098fe92ac (diff) |
It's a start, at least.
-rw-r--r-- | bs4/__init__.py | 8 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 12 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 3 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 3 | ||||
-rw-r--r-- | bs4/element.py | 21 |
5 files changed, 31 insertions, 16 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 98ac57b..786b57b 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -193,9 +193,9 @@ class BeautifulSoup(Tag):
         self.tagStack = []
         self.pushTag(self)
 
-    def new_tag(self, name, **attrs):
+    def new_tag(self, name, namespace=None, **attrs):
         """Create a new tag associated with this soup."""
-        return Tag(None, self.builder, name, attrs)
+        return Tag(None, self.builder, name, namespace, attrs)
 
     def new_string(self, s):
         """Create a new NavigableString associated with this soup."""
@@ -272,7 +272,7 @@ class BeautifulSoup(Tag):
             mostRecentTag = self.popTag()
         return mostRecentTag
 
-    def handle_starttag(self, name, attrs):
+    def handle_starttag(self, name, namespace, attrs):
         """Push a start tag on to the stack.
 
         If this method returns None, the tag was rejected by the
@@ -289,7 +289,7 @@ class BeautifulSoup(Tag):
                  or not self.parse_only.search_tag(name, attrs))):
             return None
 
-        tag = Tag(self, self.builder, name, attrs, self.currentTag,
+        tag = Tag(self, self.builder, name, namespace, attrs, self.currentTag,
                   self.previous_element)
         if tag is None:
             return tag
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 0d7a1a9..7ce69aa 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -8,6 +8,7 @@ from bs4.builder import (
     HTML_5,
     HTMLTreeBuilder,
     )
+from bs4.element import NamespacedAttribute
 import html5lib
 from html5lib.constants import (
     DataLossWarning,
@@ -58,9 +59,6 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
 
     def __init__(self, soup, namespaceHTMLElements):
         self.soup = soup
-        if namespaceHTMLElements:
-            warnings.warn("namespaceHTMLElements not supported yet",
-                          DataLossWarning)
         super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
 
     def documentClass(self):
@@ -76,9 +74,7 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
         self.soup.object_was_parsed(doctype)
 
     def elementClass(self, name, namespace):
-        if namespace is not None:
-            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
-        tag = self.soup.new_tag(name)
+        tag = self.soup.new_tag(name, namespace)
         return Element(tag, self.soup, namespace)
 
     def commentClass(self, data):
@@ -144,6 +140,8 @@ class Element(html5lib.treebuilders._base.Node):
     def setAttributes(self, attributes):
         if attributes is not None and attributes != {}:
             for name, value in list(attributes.items()):
+                if isinstance(name, tuple):
+                    name = NamespacedAttribute(*name)
                 self.element[name] = value
             # The attributes may contain variables that need substitution.
             # Call set_up_substitutions manually.
@@ -189,7 +187,7 @@ class Element(html5lib.treebuilders._base.Node):
                     TextNode(child, self.soup))
 
     def cloneNode(self):
-        tag = self.soup.new_tag(self.element.name)
+        tag = self.soup.new_tag(self.element.name, self.namespace)
         node = Element(tag, self.soup, self.namespace)
         for key,value in self.attributes:
             node.attributes[key] = value
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index c785eed..ec6d456 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -68,7 +68,8 @@ class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder):
         super(HTMLParserTreeBuilder, self).feed(markup)
 
     def handle_starttag(self, name, attrs):
-        self.soup.handle_starttag(name, dict(attrs))
+        # XXX namespace
+        self.soup.handle_starttag(name, None, dict(attrs))
 
     def handle_endtag(self, name):
         self.soup.handle_endtag(name)
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index cc3cb86..985a030 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -66,7 +66,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         pass
 
     def start(self, name, attrs):
-        self.soup.handle_starttag(name, attrs)
+        # XXX namespace
+        self.soup.handle_starttag(name, None, attrs)
 
     def end(self, name):
         self.soup.endData()
diff --git a/bs4/element.py b/bs4/element.py
index 513407c..926fb8f 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -22,6 +22,20 @@ def _alias(attr):
     return alias
 
 
+class NamespacedAttribute(object):
+
+    def __init__(self, namespace_abbreviation, name, namespace):
+        self.namespace_abbreviation = namespace_abbreviation
+        self.name = name
+        self.namespace = namespace
+
+    def __str__(self):
+        name = self.name
+        if self.namespace_abbreviation:
+            name = self.namespace_abbreviation + ":" + name
+        return name
+
+
 class PageElement(object):
     """Contains the navigational information for some part of the page
     (either a tag or a piece of text)"""
@@ -507,8 +521,8 @@ class Tag(PageElement):
 
     """Represents a found HTML tag with its attributes and contents."""
 
-    def __init__(self, parser=None, builder=None, name=None, attrs=None,
-                 parent=None, previous=None):
+    def __init__(self, parser=None, builder=None, name=None, namespace=None,
+                 attrs=None, parent=None, previous=None):
         "Basic constructor."
 
         if parser is None:
@@ -520,6 +534,7 @@ class Tag(PageElement):
         if name is None:
             raise ValueError("No value provided for new tag's name.")
         self.name = name
+        self.namespace = namespace
         if attrs is None:
             attrs = {}
         else:
@@ -779,7 +794,7 @@ class Tag(PageElement):
                     and '%SOUP-ENCODING%' in val):
                     val = self.substitute_encoding(val, eventual_encoding)
 
-                decoded = (key + '='
+                decoded = (str(key) + '='
                            + EntitySubstitution.substitute_xml(val, True))
             attrs.append(decoded)
         close = ''