Pass the user-specified encoding in to html5lib rather than dropping it on the floor.

author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-18 11:29:43 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-18 11:29:43 -0500
commit: 75c5891980c961dfe36745c1934010560666f938 (patch)
tree: fa62ea107db97916fa538883ae561fba64ea13d9
parent: ddf9d04e42168fdb25b742b35efc891789a4b6c9 (diff)
3 files changed, 22 insertions, 6 deletions
diff --git a/beautifulsoup/builder/html5lib_builder.py b/beautifulsoup/builder/html5lib_builder.py
index a5aaa01..bb0e374 100644
--- a/beautifulsoup/builder/html5lib_builder.py
+++ b/beautifulsoup/builder/html5lib_builder.py
@@ -13,10 +13,15 @@ from beautifulsoup.element import (
 class HTML5TreeBuilder(HTMLTreeBuilder):
     """Use html5lib to build a tree."""
 
+    def prepare_markup(self, markup, user_specified_encoding):
+        # Store the user-specified encoding for use later on.
+        self.user_specified_encoding = user_specified_encoding
+        return markup, None, None
+
     # These methods are defined by Beautiful Soup.
     def feed(self, markup):
         parser = html5lib.HTMLParser(tree=self.create_treebuilder)
-        doc = parser.parse(markup)
+        doc = parser.parse(markup, encoding=self.user_specified_encoding)
 
         # Set the character encoding detected by the tokenizer.
         doc.originalEncoding = parser.tokenizer.stream.charEncoding[0]
diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py
index 1034720..59d84a3 100644
--- a/tests/test_html5lib.py
+++ b/tests/test_html5lib.py
@@ -146,4 +146,12 @@ class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion):
     def default_builder(self):
         return HTML5TreeBuilder()
 
-    pass
+    def test_real_hebrew_document(self):
+        # A real-world test to make sure we can convert ISO-8859-8 (a
+        # Hebrew encoding) to UTF-8.
+        soup = self.soup(self.HEBREW_DOCUMENT,
+                         fromEncoding="iso-8859-8")
+        self.assertEquals(soup.originalEncoding, 'iso8859-8')
+        self.assertEquals(
+            soup.encode('utf-8'),
+            self.HEBREW_DOCUMENT.decode("iso-8859-8").encode("utf-8"))
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 98dd8c2..a1f156a 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -421,11 +421,14 @@ class TestLXMLBuilderEncodingConversion(SoupTest):
         soup_from_unicode = self.soup(self.unicode_data)
         self.assertEquals(soup_from_unicode.encode('utf-8'), self.utf8_data)
 
+    HEBREW_DOCUMENT = '<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
+
     def test_real_hebrew_document(self):
         # A real-world test to make sure we can convert ISO-8859-9 (a
         # Hebrew encoding) to UTF-8.
-        iso_8859_8= '<HTML><HEAD><TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE></HEAD><BODY><H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\xed\xe5\xec\xf9</BODY></HTML>'
-        utf8 = '<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9</body></html>'
-        soup = self.soup(iso_8859_8, fromEncoding="iso-8859-8")
+        soup = self.soup(self.HEBREW_DOCUMENT,
+                         fromEncoding="iso-8859-8")
         self.assertEquals(soup.originalEncoding, 'iso-8859-8')
-        self.assertEquals(soup.encode('utf-8'), utf8)
+        self.assertEquals(
+            soup.encode('utf-8'),
+            self.HEBREW_DOCUMENT.decode("iso-8859-8").encode("utf-8"))
author	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-18 11:29:43 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-18 11:29:43 -0500
commit	75c5891980c961dfe36745c1934010560666f938 (patch)
tree	fa62ea107db97916fa538883ae561fba64ea13d9
parent	ddf9d04e42168fdb25b742b35efc891789a4b6c9 (diff)