Defer to html5lib's Unicode converter rather than using Unicode, Dammit. The lxml treebuilder still uses UD.

author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-18 12:10:10 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-18 12:10:10 -0500
commit: 0dda99b15112df7225e647db9702fbd62dcc8ea8 (patch)
tree: 1127d44d52716738835c6ab2128fdb1561bc7cc2 /tests/test_html5lib.py
parent: 66cbef12d959149746b3361f227f2a0328a31469 (diff)
parent: 945b719a28c229178e710b749d2af4d00a81bdba (diff)
1 files changed, 17 insertions, 0 deletions
diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py
index 3efdebf..59d84a3 100644
--- a/tests/test_html5lib.py
+++ b/tests/test_html5lib.py
@@ -3,6 +3,7 @@ from beautifulsoup.element import Comment
 from test_lxml import (
     TestLXMLBuilder,
     TestLXMLBuilderInvalidMarkup,
+    TestLXMLBuilderEncodingConversion,
     )
 
 class TestHTML5Builder(TestLXMLBuilder):
@@ -138,3 +139,19 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
         utf8 = utf8.replace("\xe9", "\xc3\xa9")
 
         #print soup
+
+
+class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion):
+    @property
+    def default_builder(self):
+        return HTML5TreeBuilder()
+
+    def test_real_hebrew_document(self):
+        # A real-world test to make sure we can convert ISO-8859-8 (a
+        # Hebrew encoding) to UTF-8.
+        soup = self.soup(self.HEBREW_DOCUMENT,
+                         fromEncoding="iso-8859-8")
+        self.assertEquals(soup.originalEncoding, 'iso8859-8')
+        self.assertEquals(
+            soup.encode('utf-8'),
+            self.HEBREW_DOCUMENT.decode("iso-8859-8").encode("utf-8"))
author	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-18 12:10:10 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-18 12:10:10 -0500
commit	0dda99b15112df7225e647db9702fbd62dcc8ea8 (patch)
tree	1127d44d52716738835c6ab2128fdb1561bc7cc2 /tests/test_html5lib.py
parent	66cbef12d959149746b3361f227f2a0328a31469 (diff)
parent	945b719a28c229178e710b749d2af4d00a81bdba (diff)