If you encode a document with a Python-specific encoding like 'unicode_escape', that encoding is no longer mentioned in the final XML or HTML document. Instead, encoding information is omitted or left blank. [bug=1874955]
author: Leonard Richardson <leonardr@segfault.org> 2020-04-24 22:13:30 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2020-04-24 22:13:30 -0400
commit: 197470f217ab994dd0ba8e143418a54d69df8523 (patch)
tree: f962eb0fa745c772f63617abfb9caaee2f7c233d /bs4/element.py
parent: e0d4d8de8f9608e3ac2d637544c786958b3d34b4 (diff)
1 files changed, 33 insertions, 0 deletions
diff --git a/bs4/element.py b/bs4/element.py
index 8c553cd..1744beb 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -43,6 +43,35 @@ def _alias(attr):
     return alias
 
 
+# These encodings are recognized by Python (so PageElement.encode
+# could theoretically support them) but XML and HTML don't recognize
+# them (so they should not show up in an XML or HTML document as that
+# document's encoding).
+#
+# If an XML document is encoded in one of these encodings, no encoding
+# will be mentioned in the XML declaration. If an HTML document is
+# encoded in one of these encodings, and the HTML document has a
+# <meta> tag that mentions an encoding, the encoding will be given as
+# the empty string.
+#
+# Source:
+# https://docs.python.org/3/library/codecs.html#python-specific-encodings
+PYTHON_SPECIFIC_ENCODINGS = set([
+    u"idna",
+    u"mbcs",
+    u"oem",
+    u"palmos",
+    u"punycode",
+    u"raw_unicode_escape",
+    u"undefined",
+    u"unicode_escape",
+    u"raw-unicode-escape",
+    u"unicode-escape",
+    u"string-escape",
+    u"string_escape",
+])
+    
+
 class NamespacedAttribute(unicode):
     """A namespaced string (e.g. 'xml:lang') that remembers the namespace
     ('xml') and the name ('lang') that were used to create it.
@@ -85,6 +114,8 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
         """When an HTML document is being encoded to a given encoding, the
         value of a meta tag's 'charset' is the name of the encoding.
         """
+        if encoding in PYTHON_SPECIFIC_ENCODINGS:
+            return ''
         return encoding
 
 
@@ -110,6 +141,8 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
         return obj
 
     def encode(self, encoding):
+        if encoding in PYTHON_SPECIFIC_ENCODINGS:
+            return ''
         def rewrite(match):
             return match.group(1) + encoding
         return self.CHARSET_RE.sub(rewrite, self.original_value)
author	Leonard Richardson <leonardr@segfault.org>	2020-04-24 22:13:30 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2020-04-24 22:13:30 -0400
commit	197470f217ab994dd0ba8e143418a54d69df8523 (patch)
tree	f962eb0fa745c772f63617abfb9caaee2f7c233d /bs4/element.py
parent	e0d4d8de8f9608e3ac2d637544c786958b3d34b4 (diff)