If you encode a document with a Python-specific encoding like 'unicode_escape', that encoding is no longer mentioned in the final XML or HTML document. Instead, encoding information is omitted or left blank. [bug=1874955]
author: Leonard Richardson <leonardr@segfault.org> 2020-04-24 22:13:30 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2020-04-24 22:13:30 -0400
commit: 197470f217ab994dd0ba8e143418a54d69df8523 (patch)
tree: f962eb0fa745c772f63617abfb9caaee2f7c233d /bs4/testing.py
parent: e0d4d8de8f9608e3ac2d637544c786958b3d34b4 (diff)
1 files changed, 43 insertions, 0 deletions
diff --git a/bs4/testing.py b/bs4/testing.py
index 328bd56..660cccb 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -15,6 +15,7 @@ from bs4.element import (
     Comment,
     ContentMetaAttributeValue,
     Doctype,
+    PYTHON_SPECIFIC_ENCODINGS,
     SoupStrainer,
     Script,
     Stylesheet,
@@ -821,6 +822,29 @@ Hello, world!
         # encoding.
         self.assertEqual('utf8', charset.encode("utf8"))
 
+    def test_python_specific_encodings_not_used_in_charset(self):
+        # You can encode an HTML document using a Python-specific
+        # encoding, but that encoding won't be mentioned _inside_ the
+        # resulting document. Instead, the document will appear to
+        # have no encoding.
+        for markup in [
+            b'<meta charset="utf8"></head>',
+            b'<meta id="encoding" charset="utf-8" />'
+        ]:
+            soup = self.soup(markup)
+            for encoding in PYTHON_SPECIFIC_ENCODINGS:
+                if encoding in (
+                    u'idna', u'mbcs', u'oem', u'undefined',
+                    u'string_escape', u'string-escape'
+                ):
+                    # These raise an exception, for one reason or
+                    # another, if we actually try to use them.
+                    continue
+                encoded = soup.encode(encoding)
+                # charset may not be the tag's first attribute.
+                assert b'charset=""' in encoded
+                assert encoding.encode("ascii") not in encoded
+        
     def test_tag_with_no_attributes_can_have_attributes_added(self):
         data = self.soup("<a>text</a>")
         data.a['foo'] = 'bar'
@@ -854,6 +878,25 @@ class XMLTreeBuilderSmokeTest(object):
         soup = self.soup(markup)
         self.assertEqual(markup, soup.encode("utf8"))
 
+    def test_python_specific_encodings_not_used_in_xml_declaration(self):
+        # An XML document can be encoded with a Python-specific
+        # encoding, but the name of that encoding must never show up
+        # _inside_ the resulting document.
+        markup = b"""<?xml version="1.0"?>\n<foo/>"""
+        soup = self.soup(markup)
+        # For one reason or another, these will raise an exception if
+        # we actually try to use them, so they are not exercised.
+        unusable = (
+            u'idna', u'mbcs', u'oem', u'undefined',
+            u'string_escape', u'string-escape'
+        )
+        for encoding in PYTHON_SPECIFIC_ENCODINGS:
+            if encoding in unusable:
+                continue
+            encoded = soup.encode(encoding)
+            assert b'<?xml version="1.0"?>' in encoded
+            assert encoding.encode("ascii") not in encoded
+
     def test_processing_instruction(self):
         markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
         soup = self.soup(markup)
author	Leonard Richardson <leonardr@segfault.org>	2020-04-24 22:13:30 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2020-04-24 22:13:30 -0400
commit	197470f217ab994dd0ba8e143418a54d69df8523 (patch)
tree	f962eb0fa745c772f63617abfb9caaee2f7c233d /bs4/testing.py
parent	e0d4d8de8f9608e3ac2d637544c786958b3d34b4 (diff)