diff options
author | Leonard Richardson <leonardr@segfault.org> | 2020-04-24 22:13:30 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2020-04-24 22:13:30 -0400 |
commit | 197470f217ab994dd0ba8e143418a54d69df8523 (patch) | |
tree | f962eb0fa745c772f63617abfb9caaee2f7c233d /bs4/element.py | |
parent | e0d4d8de8f9608e3ac2d637544c786958b3d34b4 (diff) |
If you encode a document with a Python-specific encoding like
'unicode_escape', that encoding is no longer mentioned in the final
XML or HTML document. Instead, encoding information is omitted or
left blank. [bug=1874955]
Diffstat (limited to 'bs4/element.py')
-rw-r--r-- | bs4/element.py | 33 |
1 files changed, 33 insertions, 0 deletions
```diff
diff --git a/bs4/element.py b/bs4/element.py
index 8c553cd..1744beb 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -43,6 +43,35 @@ def _alias(attr):
     return alias
 
 
+# These encodings are recognized by Python (so PageElement.encode
+# could theoretically support them) but XML and HTML don't recognize
+# them (so they should not show up in an XML or HTML document as that
+# document's encoding).
+#
+# If an XML document is encoded in one of these encodings, no encoding
+# will be mentioned in the XML declaration. If an HTML document is
+# encoded in one of these encodings, and the HTML document has a
+# <meta> tag that mentions an encoding, the encoding will be given as
+# the empty string.
+#
+# Source:
+# https://docs.python.org/3/library/codecs.html#python-specific-encodings
+PYTHON_SPECIFIC_ENCODINGS = set([
+    u"idna",
+    u"mbcs",
+    u"oem",
+    u"palmos",
+    u"punycode",
+    u"raw_unicode_escape",
+    u"undefined",
+    u"unicode_escape",
+    u"raw-unicode-escape",
+    u"unicode-escape",
+    u"string-escape",
+    u"string_escape",
+])
+
+
 class NamespacedAttribute(unicode):
     """A namespaced string (e.g. 'xml:lang') that remembers the namespace
     ('xml') and the name ('lang') that were used to create it.
@@ -85,6 +114,8 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
         """When an HTML document is being encoded to a given encoding, the
         value of a meta tag's 'charset' is the name of the encoding.
         """
+        if encoding in PYTHON_SPECIFIC_ENCODINGS:
+            return ''
         return encoding
 
 
@@ -110,6 +141,8 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
         return obj
 
     def encode(self, encoding):
+        if encoding in PYTHON_SPECIFIC_ENCODINGS:
+            return ''
         def rewrite(match):
             return match.group(1) + encoding
         return self.CHARSET_RE.sub(rewrite, self.original_value)
```