summaryrefslogtreecommitdiff
path: root/bs4/builder/_html5lib.py
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/builder/_html5lib.py')
-rw-r--r--bs4/builder/_html5lib.py61
1 files changed, 47 insertions, 14 deletions
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 755518d..5f54893 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -1,8 +1,10 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
__all__ = [
'HTML5TreeBuilder',
]
-from pdb import set_trace
import warnings
import re
from bs4.builder import (
@@ -27,6 +29,15 @@ from bs4.element import (
Tag,
)
+try:
+ # Pre-0.99999999
+ from html5lib.treebuilders import _base as treebuilder_base
+ new_html5lib = False
+except ImportError, e:
+ # 0.99999999 and up
+ from html5lib.treebuilders import base as treebuilder_base
+ new_html5lib = True
+
class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree."""
@@ -51,7 +62,14 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
if self.soup.parse_only is not None:
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
- doc = parser.parse(markup, encoding=self.user_specified_encoding)
+
+ extra_kwargs = dict()
+ if not isinstance(markup, unicode):
+ if new_html5lib:
+ extra_kwargs['override_encoding'] = self.user_specified_encoding
+ else:
+ extra_kwargs['encoding'] = self.user_specified_encoding
+ doc = parser.parse(markup, **extra_kwargs)
# Set the character encoding detected by the tokenizer.
if isinstance(markup, unicode):
@@ -59,7 +77,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
else:
- doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
+ original_encoding = parser.tokenizer.stream.charEncoding[0]
+ if not isinstance(original_encoding, basestring):
+ # In 0.99999999 and up, the encoding is an html5lib
+ # Encoding object. We want to use a string for compatibility
+ # with other tree builders.
+ original_encoding = original_encoding.name
+ doc.original_encoding = original_encoding
def create_treebuilder(self, namespaceHTMLElements):
self.underlying_builder = TreeBuilderForHtml5lib(
@@ -71,7 +95,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
return u'<html><head></head><body>%s</body></html>' % fragment
-class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
+class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
def __init__(self, namespaceHTMLElements, soup=None):
if soup:
@@ -114,7 +138,7 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
return self.soup
def getFragment(self):
- return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
+ return treebuilder_base.TreeBuilder.getFragment(self).element
def testSerializer(self, element):
from bs4 import BeautifulSoup
@@ -196,9 +220,9 @@ class AttrList(object):
return name in list(self.attrs.keys())
-class Element(html5lib.treebuilders._base.Node):
+class Element(treebuilder_base.Node):
def __init__(self, element, soup, namespace):
- html5lib.treebuilders._base.Node.__init__(self, element.name)
+ treebuilder_base.Node.__init__(self, element.name)
self.element = element
self.soup = soup
self.namespace = namespace
@@ -258,6 +282,8 @@ class Element(html5lib.treebuilders._base.Node):
most_recent_element=most_recent_element)
def getAttributes(self):
+ if isinstance(self.element, Comment):
+ return {}
return AttrList(self.element)
def setAttributes(self, attributes):
@@ -311,6 +337,7 @@ class Element(html5lib.treebuilders._base.Node):
# print "MOVE", self.element.contents
# print "FROM", self.element
# print "TO", new_parent.element
+
element = self.element
new_parent_element = new_parent.element
# Determine what this tag's next_element will be once all the children
@@ -329,7 +356,6 @@ class Element(html5lib.treebuilders._base.Node):
new_parents_last_descendant_next_element = new_parent_element.next_element
to_append = element.contents
- append_after = new_parent_element.contents
if len(to_append) > 0:
# Set the first child's previous_element and previous_sibling
# to elements within the new parent
@@ -346,12 +372,19 @@ class Element(html5lib.treebuilders._base.Node):
if new_parents_last_child:
new_parents_last_child.next_sibling = first_child
- # Fix the last child's next_element and next_sibling
- last_child = to_append[-1]
- last_child.next_element = new_parents_last_descendant_next_element
+ # Find the very last element being moved. It is now the
+ # parent's last descendant. It has no .next_sibling and
+ # its .next_element is whatever the previous last
+ # descendant had.
+ last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
+
+ last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
if new_parents_last_descendant_next_element:
- new_parents_last_descendant_next_element.previous_element = last_child
- last_child.next_sibling = None
+ # TODO: This code has no test coverage and I'm not sure
+ # how to get html5lib to go through this path, but it's
+ # just the other side of the previous line.
+ new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
+ last_childs_last_descendant.next_sibling = None
for child in to_append:
child.parent = new_parent_element
@@ -385,7 +418,7 @@ class Element(html5lib.treebuilders._base.Node):
class TextNode(Element):
def __init__(self, element, soup):
- html5lib.treebuilders._base.Node.__init__(self, None)
+ treebuilder_base.Node.__init__(self, None)
self.element = element
self.soup = soup