Defer to html5lib's Unicode converter rather than using Unicode, Dammit. The lxml treebuilder still uses Unicode, Dammit.

author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-18 12:10:10 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-18 12:10:10 -0500
commit: 0dda99b15112df7225e647db9702fbd62dcc8ea8 (patch)
tree: 1127d44d52716738835c6ab2128fdb1561bc7cc2 /beautifulsoup/builder/html5lib_builder.py
parent: 66cbef12d959149746b3361f227f2a0328a31469 (diff)
parent: 945b719a28c229178e710b749d2af4d00a81bdba (diff)
1 file changed, 14 insertions, 1 deletion
diff --git a/beautifulsoup/builder/html5lib_builder.py b/beautifulsoup/builder/html5lib_builder.py
index dc95493..95151da 100644
--- a/beautifulsoup/builder/html5lib_builder.py
+++ b/beautifulsoup/builder/html5lib_builder.py
@@ -13,10 +13,23 @@ from beautifulsoup.element import (
 class HTML5TreeBuilder(HTMLTreeBuilder):
     """Use html5lib to build a tree."""
 
+    def prepare_markup(self, markup, user_specified_encoding):
+        # Store the user-specified encoding for use later on.
+        self.user_specified_encoding = user_specified_encoding
+        return markup, None, None
+
     # These methods are defined by Beautiful Soup.
     def feed(self, markup):
         parser = html5lib.HTMLParser(tree=self.create_treebuilder)
-        doc = parser.parse(markup)
+        doc = parser.parse(markup, encoding=self.user_specified_encoding)
+
+        # Set the character encoding detected by the tokenizer.
+        if isinstance(markup, unicode):
+            # We need to special-case this because html5lib sets
+            # charEncoding to UTF-8 if it gets Unicode input.
+            doc.originalEncoding = None
+        else:
+            doc.originalEncoding = parser.tokenizer.stream.charEncoding[0]
 
     def create_treebuilder(self, namespaceHTMLElements):
         self.underlying_builder = TreeBuilderForHtml5lib(
author	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-18 12:10:10 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-18 12:10:10 -0500
commit	0dda99b15112df7225e647db9702fbd62dcc8ea8 (patch)
tree	1127d44d52716738835c6ab2128fdb1561bc7cc2 /beautifulsoup/builder/html5lib_builder.py
parent	66cbef12d959149746b3361f227f2a0328a31469 (diff)
parent	945b719a28c229178e710b749d2af4d00a81bdba (diff)