summary | refs | log | tree | commit | diff
path: root/bs4/element.py
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2020-04-24 22:13:30 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2020-04-24 22:13:30 -0400
commit: 197470f217ab994dd0ba8e143418a54d69df8523 (patch)
tree: f962eb0fa745c772f63617abfb9caaee2f7c233d /bs4/element.py
parent: e0d4d8de8f9608e3ac2d637544c786958b3d34b4 (diff)
If you encode a document with a Python-specific encoding like
'unicode_escape', that encoding is no longer mentioned in the final XML or HTML document. Instead, encoding information is omitted or left blank. [bug=1874955]
Diffstat (limited to 'bs4/element.py')
-rw-r--r-- bs4/element.py | 33
1 file changed, 33 insertions, 0 deletions
diff --git a/bs4/element.py b/bs4/element.py
index 8c553cd..1744beb 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -43,6 +43,35 @@ def _alias(attr):
return alias
+# These encodings are recognized by Python (so PageElement.encode
+# could theoretically support them) but XML and HTML don't recognize
+# them (so they should not show up in an XML or HTML document as that
+# document's encoding).
+#
+# If an XML document is encoded in one of these encodings, no encoding
+# will be mentioned in the XML declaration. If an HTML document is
+# encoded in one of these encodings, and the HTML document has a
+# <meta> tag that mentions an encoding, that attribute's value
+# (charset or content) will be given as the empty string.
+#
+# Source:
+# https://docs.python.org/3/library/codecs.html#python-specific-encodings
+PYTHON_SPECIFIC_ENCODINGS = set([
+ u"idna",
+ u"mbcs",
+ u"oem",
+ u"palmos",
+ u"punycode",
+ u"raw_unicode_escape",
+ u"undefined",
+ u"unicode_escape",
+ u"raw-unicode-escape",
+ u"unicode-escape",
+ u"string-escape",
+ u"string_escape",
+])
+
+
class NamespacedAttribute(unicode):
"""A namespaced string (e.g. 'xml:lang') that remembers the namespace
('xml') and the name ('lang') that were used to create it.
@@ -85,6 +114,8 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
"""When an HTML document is being encoded to a given encoding, the
value of a meta tag's 'charset' is the name of the encoding.
"""
+ if encoding in PYTHON_SPECIFIC_ENCODINGS:
+ return ''
return encoding
@@ -110,6 +141,8 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return obj
def encode(self, encoding):
+ if encoding in PYTHON_SPECIFIC_ENCODINGS:
+ return ''
def rewrite(match):
return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value)