4 files changed, 71 insertions, 21 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index f8fce56..601979b 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -1,9 +1,13 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
 from collections import defaultdict
 import itertools
 import sys
 from bs4.element import (
     CharsetMetaAttributeValue,
     ContentMetaAttributeValue,
+    HTMLAwareEntitySubstitution,
     whitespace_re
     )
 
@@ -227,7 +231,7 @@ class HTMLTreeBuilder(TreeBuilder):
     Such as which tags are empty-element tags.
     """
 
-    preserve_whitespace_tags = set(['pre', 'textarea'])
+    preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
     empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                               'spacer', 'link', 'frame', 'base'])
 
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 755518d..5f54893 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -1,8 +1,10 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
 __all__ = [
     'HTML5TreeBuilder',
     ]
 
-from pdb import set_trace
 import warnings
 import re
 from bs4.builder import (
@@ -27,6 +29,15 @@ from bs4.element import (
     Tag,
     )
 
+try:
+    # html5lib before 0.99999999 exposes the treebuilder base as `_base`.
+    from html5lib.treebuilders import _base as treebuilder_base
+    new_html5lib = False
+except ImportError:
+    # html5lib 0.99999999 and up exposes it as `base` instead.
+    from html5lib.treebuilders import base as treebuilder_base
+    new_html5lib = True
+
 class HTML5TreeBuilder(HTMLTreeBuilder):
     """Use html5lib to build a tree."""
 
@@ -51,7 +62,14 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
         if self.soup.parse_only is not None:
             warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
         parser = html5lib.HTMLParser(tree=self.create_treebuilder)
-        doc = parser.parse(markup, encoding=self.user_specified_encoding)
+
+        extra_kwargs = dict()
+        if not isinstance(markup, unicode):
+            if new_html5lib:
+                extra_kwargs['override_encoding'] = self.user_specified_encoding
+            else:
+                extra_kwargs['encoding'] = self.user_specified_encoding
+        doc = parser.parse(markup, **extra_kwargs)
 
         # Set the character encoding detected by the tokenizer.
         if isinstance(markup, unicode):
@@ -59,7 +77,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
             # charEncoding to UTF-8 if it gets Unicode input.
             doc.original_encoding = None
         else:
-            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
+            original_encoding = parser.tokenizer.stream.charEncoding[0]
+            if not isinstance(original_encoding, basestring):
+                # In 0.99999999 and up, the encoding is an html5lib
+                # Encoding object. We want to use a string for compatibility
+                # with other tree builders.
+                original_encoding = original_encoding.name
+            doc.original_encoding = original_encoding
 
     def create_treebuilder(self, namespaceHTMLElements):
         self.underlying_builder = TreeBuilderForHtml5lib(
@@ -71,7 +95,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
         return u'<html><head></head><body>%s</body></html>' % fragment
 
 
-class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
+class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
 
     def __init__(self, namespaceHTMLElements, soup=None):
         if soup:
@@ -114,7 +138,7 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
         return self.soup
 
     def getFragment(self):
-        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
+        return treebuilder_base.TreeBuilder.getFragment(self).element
 
     def testSerializer(self, element):
         from bs4 import BeautifulSoup
@@ -196,9 +220,9 @@ class AttrList(object):
         return name in list(self.attrs.keys())
 
 
-class Element(html5lib.treebuilders._base.Node):
+class Element(treebuilder_base.Node):
     def __init__(self, element, soup, namespace):
-        html5lib.treebuilders._base.Node.__init__(self, element.name)
+        treebuilder_base.Node.__init__(self, element.name)
         self.element = element
         self.soup = soup
         self.namespace = namespace
@@ -258,6 +282,8 @@ class Element(html5lib.treebuilders._base.Node):
                 most_recent_element=most_recent_element)
 
     def getAttributes(self):
+        if isinstance(self.element, Comment):
+            return {}
         return AttrList(self.element)
 
     def setAttributes(self, attributes):
@@ -311,6 +337,7 @@ class Element(html5lib.treebuilders._base.Node):
         # print "MOVE", self.element.contents
         # print "FROM", self.element
         # print "TO", new_parent.element
+
         element = self.element
         new_parent_element = new_parent.element
         # Determine what this tag's next_element will be once all the children
@@ -329,7 +356,6 @@ class Element(html5lib.treebuilders._base.Node):
             new_parents_last_descendant_next_element = new_parent_element.next_element
 
         to_append = element.contents
-        append_after = new_parent_element.contents
         if len(to_append) > 0:
             # Set the first child's previous_element and previous_sibling
             # to elements within the new parent
@@ -346,12 +372,19 @@ class Element(html5lib.treebuilders._base.Node):
             if new_parents_last_child:
                 new_parents_last_child.next_sibling = first_child
 
-            # Fix the last child's next_element and next_sibling
-            last_child = to_append[-1]
-            last_child.next_element = new_parents_last_descendant_next_element
+            # Find the very last element being moved. It is now the
+            # parent's last descendant. It has no .next_sibling and
+            # its .next_element is whatever the previous last
+            # descendant had.
+            last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
+
+            last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
             if new_parents_last_descendant_next_element:
-                new_parents_last_descendant_next_element.previous_element = last_child
-            last_child.next_sibling = None
+                # TODO: This code has no test coverage and I'm not sure
+                # how to get html5lib to go through this path, but it's
+                # just the other side of the previous line.
+                new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
+            last_childs_last_descendant.next_sibling = None
 
         for child in to_append:
             child.parent = new_parent_element
@@ -385,7 +418,7 @@ class Element(html5lib.treebuilders._base.Node):
 
 class TextNode(Element):
     def __init__(self, element, soup):
-        html5lib.treebuilders._base.Node.__init__(self, None)
+        treebuilder_base.Node.__init__(self, None)
         self.element = element
         self.soup = soup
 
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 0101d64..823ca15 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -1,5 +1,8 @@
 """Use the HTMLParser library to parse HTML files that aren't too bad."""
 
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
 __all__ = [
     'HTMLParserTreeBuilder',
     ]
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 9e8f88f..d2ca287 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -1,3 +1,5 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
 __all__ = [
     'LXMLTreeBuilderForXML',
     'LXMLTreeBuilder',
@@ -12,6 +14,7 @@ from bs4.element import (
     Doctype,
     NamespacedAttribute,
     ProcessingInstruction,
+    XMLProcessingInstruction,
 )
 from bs4.builder import (
     FAST,
@@ -29,6 +32,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
     DEFAULT_PARSER_CLASS = etree.XMLParser
 
     is_xml = True
+    processing_instruction_class = XMLProcessingInstruction
 
     NAME = "lxml-xml"
     ALTERNATE_NAMES = ["xml"]
@@ -87,6 +91,16 @@ class LXMLTreeBuilderForXML(TreeBuilder):
 
         Each 4-tuple represents a strategy for parsing the document.
         """
+        # Pick the processing-instruction class for this document type
+        # first. Further down, instead of using UnicodeDammit to convert
+        # the bytestring to Unicode, EncodingDetector iterates over the
+        # encodings and lxml tries to parse the document as each in turn.
+        is_html = not self.is_xml
+        if is_html:
+            self.processing_instruction_class = ProcessingInstruction
+        else:
+            self.processing_instruction_class = XMLProcessingInstruction
+
         if isinstance(markup, unicode):
             # We were given Unicode. Maybe lxml can parse Unicode on
             # this system?
@@ -98,11 +112,6 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             yield (markup.encode("utf8"), "utf8",
                    document_declared_encoding, False)
 
-        # Instead of using UnicodeDammit to convert the bytestring to
-        # Unicode using different encodings, use EncodingDetector to
-        # iterate over the encodings, and tell lxml to try to parse
-        # the document as each one in turn.
-        is_html = not self.is_xml
         try_encodings = [user_specified_encoding, document_declared_encoding]
         detector = EncodingDetector(
             markup, try_encodings, is_html, exclude_encodings)
@@ -201,7 +210,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
     def pi(self, target, data):
         self.soup.endData()
         self.soup.handle_data(target + ' ' + data)
-        self.soup.endData(ProcessingInstruction)
+        self.soup.endData(self.processing_instruction_class)
 
     def data(self, content):
         self.soup.handle_data(content)
@@ -229,6 +238,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
 
     features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
     is_xml = False
+    processing_instruction_class = ProcessingInstruction
 
     def default_parser(self, encoding):
         return etree.HTMLParser