diff options
author | Leonard Richardson <leonardr@segfault.org> | 2013-05-06 17:13:43 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2013-05-06 17:13:43 -0400 |
commit | 6bdd65ceb1177d0e5d50bbe51f21b1209c72b599 (patch) | |
tree | d5196eec6c5acf50e510e0f1cd929daaa8fbc652 | |
parent | eac207a4efeca3e7a78838377aafb4a2b44f43c5 (diff) |
In an HTML document, the contents of a <script> or <style> tag will
no longer undergo entity substitution by default. XML documents work
the same way they did before. [bug=1085953]
-rw-r--r-- | NEWS.txt | 6 |
-rw-r--r-- | bs4/element.py | 88 |
-rw-r--r-- | bs4/tests/test_tree.py | 20 |
3 files changed, 104 insertions, 10 deletions
@@ -1,4 +1,8 @@ -= 4.1.4 (Unreleased) = += 4.2.0 (Unreleased) = + +* In an HTML document, the contents of a <script> or <style> tag will + no longer undergo entity substitution by default. XML documents work + the same way they did before. [bug=1085953] * Fix a bug in the html5lib treebuilder which sometimes created disconnected trees. [bug=1039527] diff --git a/bs4/element.py b/bs4/element.py index 594ef78..bc6d9c8 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -81,6 +81,38 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): return match.group(1) + encoding return self.CHARSET_RE.sub(rewrite, self.original_value) +class HTMLAwareEntitySubstitution(EntitySubstitution): + + """Entity substitution rules that are aware of some HTML quirks. + + Specifically, the contents of <script> and <style> tags should not + undergo entity substitution. + + Incoming NavigableString objects are checked to see if they're the + direct children of a <script> or <style> tag. + """ + + cdata_containing_tags = set(["script", "style"]) + + @classmethod + def _substitute_if_appropriate(cls, ns, f): + if (isinstance(ns, NavigableString) + and ns.parent is not None + and ns.parent.name in cls.cdata_containing_tags): + # Do nothing. + return ns + # Substitute. + return f(ns) + + @classmethod + def substitute_html(cls, ns): + return cls._substitute_if_appropriate( + ns, EntitySubstitution.substitute_html) + + @classmethod + def substitute_xml(cls, ns): + return cls._substitute_if_appropriate( + ns, EntitySubstitution.substitute_xml) class PageElement(object): """Contains the navigational information for some part of the page @@ -97,25 +129,60 @@ class PageElement(object): # converted to entities. This is not recommended, but it's # faster than "minimal". # A function - This function will be called on every string that - # needs to undergo entity substition - FORMATTERS = { + # needs to undergo entity substitution. 
+ # + + # In an HTML document, the default "html" and "minimal" functions + # will leave the contents of <script> and <style> tags alone. For + # an XML document, all tags will be given the same treatment. + + HTML_FORMATTERS = { + "html" : HTMLAwareEntitySubstitution.substitute_html, + "minimal" : HTMLAwareEntitySubstitution.substitute_xml, + None : None + } + + XML_FORMATTERS = { "html" : EntitySubstitution.substitute_html, "minimal" : EntitySubstitution.substitute_xml, None : None } - @classmethod def format_string(self, s, formatter='minimal'): """Format the given string using the given formatter.""" if not callable(formatter): - formatter = self.FORMATTERS.get( - formatter, EntitySubstitution.substitute_xml) + formatter = self._formatter_for_name(formatter) if formatter is None: output = s else: output = formatter(s) return output + @property + def _is_xml(self): + """Is this element part of an XML tree or an HTML tree? + + This is used when mapping a formatter name ("minimal") to an + appropriate function (one that performs entity-substitution on + the contents of <script> and <style> tags, or not). It's + inefficient, but it should be called very rarely. + """ + if self.parent is None: + # This is the top-level object. It should have .is_xml set + # from tree creation. If not, take a guess--BS is usually + # used on HTML markup. + return getattr(self, 'is_xml', False) + return self.parent._is_xml + + def _formatter_for_name(self, name): + "Look up a formatter function based on its name and the tree." + if self._is_xml: + return self.XML_FORMATTERS.get( + name, EntitySubstitution.substitute_xml) + else: + return self.HTML_FORMATTERS.get( + name, HTMLAwareEntitySubstitution.substitute_xml) + def setup(self, parent=None, previous_element=None): """Sets up the initial relations between this element and other elements.""" @@ -981,6 +1048,12 @@ class Tag(PageElement): document contains a <META> tag that mentions the document's encoding. 
""" + + # First off, turn a string formatter into a function. This + # will stop the lookup from happening over and over again. + if not callable(formatter): + formatter = self._formatter_for_name(formatter) + attrs = [] if self.attrs: for key, val in sorted(self.attrs.items()): @@ -1066,6 +1139,11 @@ class Tag(PageElement): document contains a <META> tag that mentions the document's encoding. """ + # First off, turn a string formatter into a function. This + # will stop the lookup from happening over and over again. + if not callable(formatter): + formatter = self._formatter_for_name(formatter) + pretty_print = (indent_level is not None) s = [] for c in self: diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 3e75fae..a4e2a8a 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1317,16 +1317,28 @@ class TestSubstitutions(SoupTest): </script> """ encoded = BeautifulSoup(doc).encode() - self.assertTrue("< < hey > >" in encoded) + self.assertTrue(b"< < hey > >" in encoded) + + def test_formatter_skips_style_tag_for_html_documents(self): + doc = """ + <style type="text/css"> + console.log("< < hey > > "); + </style> +""" + encoded = BeautifulSoup(doc).encode() + self.assertTrue(b"< < hey > >" in encoded) def test_formatter_processes_script_tag_for_xml_documents(self): doc = """ <script type="text/javascript"> - console.log("< < hey > > "); </script> """ - encoded = BeautifulSoup(doc).encode() - self.assertTrue("< < hey > >" in encoded) + soup = BeautifulSoup(doc, "xml") + # lxml would have stripped this while parsing, but we can add + # it later. + soup.script.string = 'console.log("< < hey > > ");' + encoded = soup.encode() + self.assertTrue(b"< < hey > >" in encoded) def test_prettify_accepts_formatter(self): soup = BeautifulSoup("<html><body>foo</body></html>") |