Create a new lxml parser object for every new parsing strategy.

author: Leonard Richardson <leonardr@segfault.org> 2013-05-31 09:17:11 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2013-05-31 09:17:11 -0400
commit: 19f05a586c79b86be8ebe06a3728ab9a94162bee (patch)
tree: 295326e49419a40a8942dc3b0552e51f97e18abb /bs4/testing.py
parent: 342da7818966498e1fc2100c0b920cbc242c9831 (diff)
1 files changed, 13 insertions, 0 deletions
diff --git a/bs4/testing.py b/bs4/testing.py
index d8ff6b7..c363a89 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -279,6 +279,14 @@ class HTMLTreeBuilderSmokeTest(object):
     # to detect any differences between them.
     #
 
+    def test_can_parse_unicode_document(self):
+        # A seemingly innocuous document... but it's in Unicode! And
+        # it contains characters that can't be represented in the
+        # encoding found in the <meta> declaration! The horror!
+        markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
+        soup = self.soup(markup)
+        self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
+
     def test_soupstrainer(self):
         """Parsers should be able to work with SoupStrainers."""
         strainer = SoupStrainer("b")
@@ -482,6 +490,11 @@ class XMLTreeBuilderSmokeTest(object):
         encoded = soup.encode()
         self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
 
+    def test_can_parse_unicode_document(self):
+        markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
+        soup = self.soup(markup)
+        self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
+
     def test_popping_namespaced_tag(self):
         markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
         soup = self.soup(markup)
author	Leonard Richardson <leonardr@segfault.org>	2013-05-31 09:17:11 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2013-05-31 09:17:11 -0400
commit	19f05a586c79b86be8ebe06a3728ab9a94162bee (patch)
tree	295326e49419a40a8942dc3b0552e51f97e18abb /bs4/testing.py
parent	342da7818966498e1fc2100c0b920cbc242c9831 (diff)