diff options
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/__init__.py | 10 | ||||
-rw-r--r-- | bs4/element.py | 33 | ||||
-rw-r--r-- | bs4/testing.py | 43 |
3 files changed, 84 insertions, 2 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py index 834f180..04dcffc 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -39,6 +39,7 @@ from .element import ( NavigableString, PageElement, ProcessingInstruction, + PYTHON_SPECIFIC_ENCODINGS, ResultSet, Script, Stylesheet, @@ -109,7 +110,7 @@ class BeautifulSoup(Tag): ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n" - + def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, element_classes=None, **kwargs): @@ -697,7 +698,7 @@ class BeautifulSoup(Tag): def handle_data(self, data): """Called by the tree builder when a chunk of textual data is encountered.""" self.current_data.append(data) - + def decode(self, pretty_print=False, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): @@ -712,6 +713,11 @@ class BeautifulSoup(Tag): if self.is_xml: # Print the XML declaration encoding_part = '' + if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS: + # This is a special Python encoding; it can't actually + # go into an XML document because it means nothing + # outside of Python. + eventual_encoding = None if eventual_encoding != None: encoding_part = ' encoding="%s"' % eventual_encoding prefix = u'<?xml version="1.0"%s?>\n' % encoding_part diff --git a/bs4/element.py b/bs4/element.py index 8c553cd..1744beb 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -43,6 +43,35 @@ def _alias(attr): return alias +# These encodings are recognized by Python (so PageElement.encode +# could theoretically support them) but XML and HTML don't recognize +# them (so they should not show up in an XML or HTML document as that +# document's encoding). +# +# If an XML document is encoded in one of these encodings, no encoding +# will be mentioned in the XML declaration. If an HTML document is +# encoded in one of these encodings, and the HTML document has a +# <meta> tag that mentions an encoding, the encoding will be given as +# the empty string. +# +# Source: +# https://docs.python.org/3/library/codecs.html#python-specific-encodings +PYTHON_SPECIFIC_ENCODINGS = set([ + u"idna", + u"mbcs", + u"oem", + u"palmos", + u"punycode", + u"raw_unicode_escape", + u"undefined", + u"unicode_escape", + u"raw-unicode-escape", + u"unicode-escape", + u"string-escape", + u"string_escape", +]) + + class NamespacedAttribute(unicode): """A namespaced string (e.g. 'xml:lang') that remembers the namespace ('xml') and the name ('lang') that were used to create it. @@ -85,6 +114,8 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): """When an HTML document is being encoded to a given encoding, the value of a meta tag's 'charset' is the name of the encoding. """ + if encoding in PYTHON_SPECIFIC_ENCODINGS: + return '' return encoding @@ -110,6 +141,8 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): return obj def encode(self, encoding): + if encoding in PYTHON_SPECIFIC_ENCODINGS: + return '' def rewrite(match): return match.group(1) + encoding return self.CHARSET_RE.sub(rewrite, self.original_value) diff --git a/bs4/testing.py b/bs4/testing.py index 328bd56..660cccb 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -15,6 +15,7 @@ from bs4.element import ( Comment, ContentMetaAttributeValue, Doctype, + PYTHON_SPECIFIC_ENCODINGS, SoupStrainer, Script, Stylesheet, @@ -821,6 +822,29 @@ Hello, world! # encoding. self.assertEqual('utf8', charset.encode("utf8")) + def test_python_specific_encodings_not_used_in_charset(self): + # You can encode an HTML document using a Python-specific + # encoding, but that encoding won't be mentioned _inside_ the + # resulting document. Instead, the document will appear to + # have no encoding. + for markup in [ + b'<meta charset="utf8"></head>' + b'<meta id="encoding" charset="utf-8" />' + ]: + soup = self.soup(markup) + for encoding in PYTHON_SPECIFIC_ENCODINGS: + if encoding in ( + u'idna', u'mbcs', u'oem', u'undefined', + u'string_escape', u'string-escape' + ): + # For one reason or another, these will raise an + # exception if we actually try to use them, so don't + # bother. + continue + encoded = soup.encode(encoding) + assert b'meta charset=""' in encoded + assert encoding.encode("ascii") not in encoded + def test_tag_with_no_attributes_can_have_attributes_added(self): data = self.soup("<a>text</a>") data.a['foo'] = 'bar' @@ -854,6 +878,25 @@ class XMLTreeBuilderSmokeTest(object): soup = self.soup(markup) self.assertEqual(markup, soup.encode("utf8")) + def test_python_specific_encodings_not_used_in_xml_declaration(self): + # You can encode an XML document using a Python-specific + # encoding, but that encoding won't be mentioned _inside_ the + # resulting document. + markup = b"""<?xml version="1.0"?>\n<foo/>""" + soup = self.soup(markup) + for encoding in PYTHON_SPECIFIC_ENCODINGS: + if encoding in ( + u'idna', u'mbcs', u'oem', u'undefined', + u'string_escape', u'string-escape' + ): + # For one reason or another, these will raise an + # exception if we actually try to use them, so don't + # bother. + continue + encoded = soup.encode(encoding) + assert b'<?xml version="1.0"?>' in encoded + assert encoding.encode("ascii") not in encoded + def test_processing_instruction(self): markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>""" soup = self.soup(markup) |