diff options
-rw-r--r-- | beautifulsoup/builder/html5lib_builder.py | 3 | ||||
-rw-r--r-- | tests/test_lxml.py | 7 |
2 files changed, 8 insertions, 2 deletions
diff --git a/beautifulsoup/builder/html5lib_builder.py b/beautifulsoup/builder/html5lib_builder.py index dc95493..a5aaa01 100644 --- a/beautifulsoup/builder/html5lib_builder.py +++ b/beautifulsoup/builder/html5lib_builder.py @@ -18,6 +18,9 @@ class HTML5TreeBuilder(HTMLTreeBuilder): parser = html5lib.HTMLParser(tree=self.create_treebuilder) doc = parser.parse(markup) + # Set the character encoding detected by the tokenizer. + doc.originalEncoding = parser.tokenizer.stream.charEncoding[0] + def create_treebuilder(self, namespaceHTMLElements): self.underlying_builder = TreeBuilderForHtml5lib( self.soup, namespaceHTMLElements) diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 1218763..98dd8c2 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -383,10 +383,13 @@ class TestLXMLBuilderEncodingConversion(SoupTest): def setUp(self): super(TestLXMLBuilderEncodingConversion, self).setUp() - self.unicode_data = u"<html><head></head><body><foo>\xe9</foo></body></html>" + self.unicode_data = u"<html><head></head><body><foo>\N{LATIN SMALL LETTER E WITH ACUTE}</foo></body></html>" self.utf8_data = self.unicode_data.encode("utf-8") + + # Just so you know what it looks like. self.assertEqual( - self.utf8_data, "<html><head></head><body><foo>\xc3\xa9</foo></body></html>") + self.utf8_data, + "<html><head></head><body><foo>\xc3\xa9</foo></body></html>") def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The originalEncoding |