summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt 6
-rw-r--r-- bs4/element.py 88
-rw-r--r-- bs4/tests/test_tree.py 20
3 files changed, 104 insertions, 10 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 3e9d015..9f8b3dc 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,4 +1,8 @@
-= 4.1.4 (Unreleased) =
+= 4.2.0 (Unreleased) =
+
+* In an HTML document, the contents of a <script> or <style> tag will
+ no longer undergo entity substitution by default. XML documents work
+ the same way they did before. [bug=1085953]
* Fix a bug in the html5lib treebuilder which sometimes created
disconnected trees. [bug=1039527]
diff --git a/bs4/element.py b/bs4/element.py
index 594ef78..bc6d9c8 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -81,6 +81,38 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value)
+class HTMLAwareEntitySubstitution(EntitySubstitution):
+
+ """Entity substitution rules that are aware of some HTML quirks.
+
+ Specifically, the contents of <script> and <style> tags should not
+ undergo entity substitution.
+
+ Incoming NavigableString objects are checked to see if they're the
+ direct children of a <script> or <style> tag.
+ """
+
+ cdata_containing_tags = set(["script", "style"])
+
+ @classmethod
+ def _substitute_if_appropriate(cls, ns, f):
+ if (isinstance(ns, NavigableString)
+ and ns.parent is not None
+ and ns.parent.name in cls.cdata_containing_tags):
+ # Do nothing.
+ return ns
+ # Substitute.
+ return f(ns)
+
+ @classmethod
+ def substitute_html(cls, ns):
+ return cls._substitute_if_appropriate(
+ ns, EntitySubstitution.substitute_html)
+
+ @classmethod
+ def substitute_xml(cls, ns):
+ return cls._substitute_if_appropriate(
+ ns, EntitySubstitution.substitute_xml)
class PageElement(object):
"""Contains the navigational information for some part of the page
@@ -97,25 +129,60 @@ class PageElement(object):
# converted to entities. This is not recommended, but it's
# faster than "minimal".
# A function - This function will be called on every string that
- # needs to undergo entity substition
- FORMATTERS = {
+ # needs to undergo entity substitution.
+ #
+
+ # In an HTML document, the default "html" and "minimal" functions
+ # will leave the contents of <script> and <style> tags alone. For
+ # an XML document, all tags will be given the same treatment.
+
+ HTML_FORMATTERS = {
+ "html" : HTMLAwareEntitySubstitution.substitute_html,
+ "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
+ None : None
+ }
+
+ XML_FORMATTERS = {
"html" : EntitySubstitution.substitute_html,
"minimal" : EntitySubstitution.substitute_xml,
None : None
}
- @classmethod
def format_string(self, s, formatter='minimal'):
"""Format the given string using the given formatter."""
if not callable(formatter):
- formatter = self.FORMATTERS.get(
- formatter, EntitySubstitution.substitute_xml)
+ formatter = self._formatter_for_name(formatter)
if formatter is None:
output = s
else:
output = formatter(s)
return output
+ @property
+ def _is_xml(self):
+ """Is this element part of an XML tree or an HTML tree?
+
+ This is used when mapping a formatter name ("minimal") to an
+ appropriate function (one that performs entity-substitution on
+ the contents of <script> and <style> tags, or not). It's
+ inefficient, but it should be called very rarely.
+ """
+ if self.parent is None:
+ # This is the top-level object. It should have .is_xml set
+ # from tree creation. If not, take a guess--BS is usually
+ # used on HTML markup.
+ return getattr(self, 'is_xml', False)
+ return self.parent._is_xml
+
+ def _formatter_for_name(self, name):
+ "Look up a formatter function based on its name and the tree."
+ if self._is_xml:
+ return self.XML_FORMATTERS.get(
+ name, EntitySubstitution.substitute_xml)
+ else:
+ return self.HTML_FORMATTERS.get(
+ name, HTMLAwareEntitySubstitution.substitute_xml)
+
def setup(self, parent=None, previous_element=None):
"""Sets up the initial relations between this element and
other elements."""
@@ -981,6 +1048,12 @@ class Tag(PageElement):
document contains a <META> tag that mentions the document's
encoding.
"""
+
+ # First off, turn a string formatter into a function. This
+ # will stop the lookup from happening over and over again.
+ if not callable(formatter):
+ formatter = self._formatter_for_name(formatter)
+
attrs = []
if self.attrs:
for key, val in sorted(self.attrs.items()):
@@ -1066,6 +1139,11 @@ class Tag(PageElement):
document contains a <META> tag that mentions the document's
encoding.
"""
+ # First off, turn a string formatter into a function. This
+ # will stop the lookup from happening over and over again.
+ if not callable(formatter):
+ formatter = self._formatter_for_name(formatter)
+
pretty_print = (indent_level is not None)
s = []
for c in self:
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 3e75fae..a4e2a8a 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1317,16 +1317,28 @@ class TestSubstitutions(SoupTest):
</script>
"""
encoded = BeautifulSoup(doc).encode()
- self.assertTrue("< < hey > >" in encoded)
+ self.assertTrue(b"< < hey > >" in encoded)
+
+ def test_formatter_skips_style_tag_for_html_documents(self):
+ doc = """
+ <style type="text/css">
+ console.log("< < hey > > ");
+ </style>
+"""
+ encoded = BeautifulSoup(doc).encode()
+ self.assertTrue(b"< < hey > >" in encoded)
def test_formatter_processes_script_tag_for_xml_documents(self):
doc = """
<script type="text/javascript">
- console.log("< < hey > > ");
</script>
"""
- encoded = BeautifulSoup(doc).encode()
- self.assertTrue("&lt; &lt; hey &gt; &gt;" in encoded)
+ soup = BeautifulSoup(doc, "xml")
+ # lxml would have stripped this while parsing, but we can add
+ # it later.
+ soup.script.string = 'console.log("< < hey > > ");'
+ encoded = soup.encode()
+ self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
def test_prettify_accepts_formatter(self):
soup = BeautifulSoup("<html><body>foo</body></html>")