diff options
-rw-r--r-- | beautifulsoup/builder/html5lib_builder.py | 7 | ||||
-rw-r--r-- | tests/test_html5lib.py | 10 | ||||
-rw-r--r-- | tests/test_lxml.py | 11 |
3 files changed, 22 insertions, 6 deletions
diff --git a/beautifulsoup/builder/html5lib_builder.py b/beautifulsoup/builder/html5lib_builder.py index a5aaa01..bb0e374 100644 --- a/beautifulsoup/builder/html5lib_builder.py +++ b/beautifulsoup/builder/html5lib_builder.py @@ -13,10 +13,15 @@ from beautifulsoup.element import ( class HTML5TreeBuilder(HTMLTreeBuilder): """Use html5lib to build a tree.""" + def prepare_markup(self, markup, user_specified_encoding): + # Store the user-specified encoding for use later on. + self.user_specified_encoding = user_specified_encoding + return markup, None, None + # These methods are defined by Beautiful Soup. def feed(self, markup): parser = html5lib.HTMLParser(tree=self.create_treebuilder) - doc = parser.parse(markup) + doc = parser.parse(markup, encoding=self.user_specified_encoding) # Set the character encoding detected by the tokenizer. doc.originalEncoding = parser.tokenizer.stream.charEncoding[0] diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py index 1034720..59d84a3 100644 --- a/tests/test_html5lib.py +++ b/tests/test_html5lib.py @@ -146,4 +146,12 @@ class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion): def default_builder(self): return HTML5TreeBuilder() - pass + def test_real_hebrew_document(self): + # A real-world test to make sure we can convert ISO-8859-8 (a + # Hebrew encoding) to UTF-8. 
+ soup = self.soup(self.HEBREW_DOCUMENT, + fromEncoding="iso-8859-8") + self.assertEquals(soup.originalEncoding, 'iso8859-8') + self.assertEquals( + soup.encode('utf-8'), + self.HEBREW_DOCUMENT.decode("iso-8859-8").encode("utf-8")) diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 98dd8c2..a1f156a 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -421,11 +421,14 @@ class TestLXMLBuilderEncodingConversion(SoupTest): soup_from_unicode = self.soup(self.unicode_data) self.assertEquals(soup_from_unicode.encode('utf-8'), self.utf8_data) + HEBREW_DOCUMENT = '<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>' + def test_real_hebrew_document(self): # A real-world test to make sure we can convert ISO-8859-9 (a # Hebrew encoding) to UTF-8. - iso_8859_8= '<HTML><HEAD><TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE></HEAD><BODY><H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\xed\xe5\xec\xf9</BODY></HTML>' - utf8 = '<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9</body></html>' - soup = self.soup(iso_8859_8, fromEncoding="iso-8859-8") + soup = self.soup(self.HEBREW_DOCUMENT, + fromEncoding="iso-8859-8") self.assertEquals(soup.originalEncoding, 'iso-8859-8') - self.assertEquals(soup.encode('utf-8'), utf8) + self.assertEquals( + soup.encode('utf-8'), + self.HEBREW_DOCUMENT.decode("iso-8859-8").encode("utf-8")) |