In an HTML document, the contents of a <script> or <style> tag will no longer undergo entity substitution by default. XML documents work the same way they did before. [bug=1085953]
author: Leonard Richardson <leonardr@segfault.org> 2013-05-06 17:13:43 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2013-05-06 17:13:43 -0400
commit: 6bdd65ceb1177d0e5d50bbe51f21b1209c72b599 (patch)
tree: d5196eec6c5acf50e510e0f1cd929daaa8fbc652
parent: eac207a4efeca3e7a78838377aafb4a2b44f43c5 (diff)
3 files changed, 104 insertions, 10 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 3e9d015..9f8b3dc 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,4 +1,8 @@
-= 4.1.4 (Unreleased) =
+= 4.2.0 (Unreleased) =
+
+* In an HTML document, the contents of a <script> or <style> tag will
+  no longer undergo entity substitution by default. XML documents work
+  the same way they did before. [bug=1085953]
 
 * Fix a bug in the html5lib treebuilder which sometimes created
   disconnected trees. [bug=1039527]
diff --git a/bs4/element.py b/bs4/element.py
index 594ef78..bc6d9c8 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -81,6 +81,38 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
             return match.group(1) + encoding
         return self.CHARSET_RE.sub(rewrite, self.original_value)
 
+class HTMLAwareEntitySubstitution(EntitySubstitution):
+
+    """Entity substitution rules that are aware of some HTML quirks.
+
+    Specifically, the contents of <script> and <style> tags should not
+    undergo entity substitution.
+
+    Incoming NavigableString objects are checked to see if they're the
+    direct children of a <script> or <style> tag.
+    """
+
+    cdata_containing_tags = set(["script", "style"])
+
+    @classmethod
+    def _substitute_if_appropriate(cls, ns, f):
+        if (isinstance(ns, NavigableString)
+            and ns.parent is not None
+            and ns.parent.name in cls.cdata_containing_tags):
+            # Do nothing.
+            return ns
+        # Substitute.
+        return f(ns)
+
+    @classmethod
+    def substitute_html(cls, ns):
+        return cls._substitute_if_appropriate(
+            ns, EntitySubstitution.substitute_html)
+
+    @classmethod
+    def substitute_xml(cls, ns):
+        return cls._substitute_if_appropriate(
+            ns, EntitySubstitution.substitute_xml)
 
 class PageElement(object):
     """Contains the navigational information for some part of the page
@@ -97,25 +129,60 @@ class PageElement(object):
     #   converted to entities.  This is not recommended, but it's
     #   faster than "minimal".
     # A function - This function will be called on every string that
-    #  needs to undergo entity substition
-    FORMATTERS = {
+    #  needs to undergo entity substitution.
+    #
+
+    # In an HTML document, the default "html" and "minimal" functions
+    # will leave the contents of <script> and <style> tags alone. For
+    # an XML document, all tags will be given the same treatment.
+
+    HTML_FORMATTERS = {
+        "html" : HTMLAwareEntitySubstitution.substitute_html,
+        "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
+        None : None
+        }
+
+    XML_FORMATTERS = {
         "html" : EntitySubstitution.substitute_html,
         "minimal" : EntitySubstitution.substitute_xml,
         None : None
         }
 
-    @classmethod
     def format_string(self, s, formatter='minimal'):
         """Format the given string using the given formatter."""
         if not callable(formatter):
-            formatter = self.FORMATTERS.get(
-                formatter, EntitySubstitution.substitute_xml)
+            formatter = self._formatter_for_name(formatter)
         if formatter is None:
             output = s
         else:
             output = formatter(s)
         return output
 
+    @property
+    def _is_xml(self):
+        """Is this element part of an XML tree or an HTML tree?
+
+        This is used when mapping a formatter name ("minimal") to an
+        appropriate function (one that performs entity-substitution on
+        the contents of <script> and <style> tags, or not). It's
+        inefficient, but it should be called very rarely.
+        """
+        if self.parent is None:
+            # This is the top-level object. It should have .is_xml set
+            # from tree creation. If not, take a guess--BS is usually
+            # used on HTML markup.
+            return getattr(self, 'is_xml', False)
+        return self.parent._is_xml
+
+    def _formatter_for_name(self, name):
+        "Look up a formatter function based on its name and the tree."
+        if self._is_xml:
+            return self.XML_FORMATTERS.get(
+                name, EntitySubstitution.substitute_xml)
+        else:
+            return self.HTML_FORMATTERS.get(
+                name, HTMLAwareEntitySubstitution.substitute_xml)
+
     def setup(self, parent=None, previous_element=None):
         """Sets up the initial relations between this element and
         other elements."""
@@ -981,6 +1048,12 @@ class Tag(PageElement):
            document contains a <META> tag that mentions the document's
            encoding.
         """
+
+        # First off, turn a string formatter into a function. This
+        # will stop the lookup from happening over and over again.
+        if not callable(formatter):
+            formatter = self._formatter_for_name(formatter)
+
         attrs = []
         if self.attrs:
             for key, val in sorted(self.attrs.items()):
@@ -1066,6 +1139,11 @@ class Tag(PageElement):
            document contains a <META> tag that mentions the document's
            encoding.
         """
+        # First off, turn a string formatter into a function. This
+        # will stop the lookup from happening over and over again.
+        if not callable(formatter):
+            formatter = self._formatter_for_name(formatter)
+
         pretty_print = (indent_level is not None)
         s = []
         for c in self:
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 3e75fae..a4e2a8a 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1317,16 +1317,28 @@ class TestSubstitutions(SoupTest):
   </script>
 """
         encoded = BeautifulSoup(doc).encode()
-        self.assertTrue("< < hey > >" in encoded)
+        self.assertTrue(b"< < hey > >" in encoded)
+
+    def test_formatter_skips_style_tag_for_html_documents(self):
+        doc = """
+  <style type="text/css">
+   console.log("< < hey > > ");
+  </style>
+"""
+        encoded = BeautifulSoup(doc).encode()
+        self.assertTrue(b"< < hey > >" in encoded)
 
     def test_formatter_processes_script_tag_for_xml_documents(self):
         doc = """
   <script type="text/javascript">
-   console.log("< < hey > > ");
   </script>
 """
-        encoded = BeautifulSoup(doc).encode()
-        self.assertTrue("&lt; &lt; hey &gt; &gt;" in encoded)
+        soup = BeautifulSoup(doc, "xml")
+        # lxml would have stripped this while parsing, but we can add
+        # it later.
+        soup.script.string = 'console.log("< < hey > > ");'
+        encoded = soup.encode()
+        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
 
     def test_prettify_accepts_formatter(self):
         soup = BeautifulSoup("<html><body>foo</body></html>")
author	Leonard Richardson <leonardr@segfault.org>	2013-05-06 17:13:43 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2013-05-06 17:13:43 -0400
commit	6bdd65ceb1177d0e5d50bbe51f21b1209c72b599 (patch)
tree	d5196eec6c5acf50e510e0f1cd929daaa8fbc652
parent	eac207a4efeca3e7a78838377aafb4a2b44f43c5 (diff)