Embedded CSS and Javascript is now stored in distinct Stylesheet and

Script tags, which are ignored by methods like get_text(). This feature is not supported by the html5lib treebuilder. [bug=1868861]
author: Leonard Richardson <leonardr@segfault.org> 2020-04-05 15:43:58 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2020-04-05 15:43:58 -0400
commit: a6f897b213bb08f0d8d8a1528937541c280abbd6 (patch)
tree: 866d3392a854ea27a172e9b456b2160307e39363
parent: ddadf13ef66122d75eadaf7f10e0937429e6a3a6 (diff)
11 files changed, 223 insertions, 27 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 5852a12..1c7d57d 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,9 +1,13 @@
-= 4.9.0 (Unreleased)
+= 4.9.0 (20200405)
 
 * Added PageElement.decomposed, a new property which lets you
   check whether you've already called decompose() on a Tag or
   NavigableString.
 
+* Embedded CSS and Javascript is now stored in distinct Stylesheet and
+  Script tags, which are ignored by methods like get_text(). This
+  feature is not supported by the html5lib treebuilder. [bug=1868861]
+
 * Added a Russian translation by 'authoress' to the repository.
 
 * Fixed an unhandled exception when formatting a Tag that had been
diff --git a/bs4/__init__.py b/bs4/__init__.py
index f828cd2..bae7fda 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -15,8 +15,8 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 """
 
 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.8.2"
-__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
+__version__ = "4.9.0"
+__copyright__ = "Copyright (c) 2004-2020 Leonard Richardson"
 # Use of this source code is governed by the MIT license.
 __license__ = "MIT"
 
@@ -423,6 +423,7 @@ class BeautifulSoup(Tag):
         self.currentTag = None
         self.tagStack = []
         self.preserve_whitespace_tag_stack = []
+        self.string_container_stack = []
         self.pushTag(self)
 
     def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
@@ -434,14 +435,28 @@ class BeautifulSoup(Tag):
             sourceline=sourceline, sourcepos=sourcepos
         )
 
+    def string_container(self, base_class=None):
+        container = base_class or NavigableString
+        
+        # There may be a general override of NavigableString.
+        container = self.element_classes.get(
+            container, container
+        )
+
+        # On top of that, we may be inside a tag that needs a special
+        # container class.
+        if self.string_container_stack:
+            container = self.builder.string_containers.get(
+                self.string_container_stack[-1].name, container
+            )
+        return container
+        
     def new_string(self, s, subclass=None):
         """Create a new NavigableString associated with this BeautifulSoup
         object.
         """
-        subclass = subclass or self.element_classes.get(
-            NavigableString, NavigableString
-        )
-        return subclass(s)
+        container = self.string_container(subclass)
+        return container(s)
 
     def insert_before(self, successor):
         """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
@@ -460,6 +475,8 @@ class BeautifulSoup(Tag):
         tag = self.tagStack.pop()
         if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
             self.preserve_whitespace_tag_stack.pop()
+        if self.string_container_stack and tag == self.string_container_stack[-1]:
+            self.string_container_stack.pop()
         #print "Pop", tag.name
         if self.tagStack:
             self.currentTag = self.tagStack[-1]
@@ -474,19 +491,14 @@ class BeautifulSoup(Tag):
         self.currentTag = self.tagStack[-1]
         if tag.name in self.builder.preserve_whitespace_tags:
             self.preserve_whitespace_tag_stack.append(tag)
+        if tag.name in self.builder.string_containers:
+            self.string_container_stack.append(tag)
 
     def endData(self, containerClass=None):
         """Method called by the TreeBuilder when the end of a data segment
         occurs.
         """
-        # Default container is NavigableString.
-        containerClass = containerClass or NavigableString
-
-        # The user may want us to instantiate some alias for the
-        # container class.
-        containerClass = self.element_classes.get(
-            containerClass, containerClass
-        )
+        containerClass = self.string_container(containerClass)
         
         if self.current_data:
             current_data = u''.join(self.current_data)
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index e8d78f9..7d3a6eb 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -7,8 +7,11 @@ import sys
 from bs4.element import (
     CharsetMetaAttributeValue,
     ContentMetaAttributeValue,
+    Stylesheet,
+    Script,
+    TemplateString,
     nonwhitespace_re
-    )
+)
 
 __all__ = [
     'HTMLTreeBuilder',
@@ -111,7 +114,12 @@ class TreeBuilder(object):
     # comma-separated list of CDATA, rather than a single CDATA.
     DEFAULT_CDATA_LIST_ATTRIBUTES = {}
 
+    # Whitespace should be preserved inside these tags.
     DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
+
+    # The textual contents of tags with these names should be
+    # instantiated with some class other than NavigableString.
+    DEFAULT_STRING_CONTAINERS = {}
     
     USE_DEFAULT = object()
 
@@ -120,12 +128,14 @@ class TreeBuilder(object):
     
     def __init__(self, multi_valued_attributes=USE_DEFAULT,
                  preserve_whitespace_tags=USE_DEFAULT,
-                 store_line_numbers=USE_DEFAULT):
+                 store_line_numbers=USE_DEFAULT,
+                 string_containers=USE_DEFAULT,
+    ):
         """Constructor.
 
         :param multi_valued_attributes: If this is set to None, the
          TreeBuilder will not turn any values for attributes like
-         'class' into lists. Setting this do a dictionary will
+         'class' into lists. Setting this to a dictionary will
          customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
          for an example.
 
@@ -138,6 +148,12 @@ class TreeBuilder(object):
          are immune from pretty-printing; their contents will always be
          output as-is.
 
+        :param string_containers: A dictionary mapping tag names to
+         the classes that should be instantiated to contain the textual
+         contents of those tags. The default is to use NavigableString
+         for every tag, no matter what the name. You can override the
+         default by changing DEFAULT_STRING_CONTAINERS.
+
         :param store_line_numbers: If the parser keeps track of the
          line numbers and positions of the original markup, that
          information will, by default, be stored in each corresponding
@@ -155,7 +171,10 @@ class TreeBuilder(object):
         self.preserve_whitespace_tags = preserve_whitespace_tags
         if store_line_numbers == self.USE_DEFAULT:
             store_line_numbers = self.TRACKS_LINE_NUMBERS
-        self.store_line_numbers = store_line_numbers
+        self.store_line_numbers = store_line_numbers 
+        if string_containers == self.USE_DEFAULT:
+            string_containers = self.DEFAULT_STRING_CONTAINERS
+        self.string_containers = string_containers
         
     def initialize_soup(self, soup):
         """The BeautifulSoup object has been initialized and is now
@@ -369,6 +388,22 @@ class HTMLTreeBuilder(TreeBuilder):
     # but it may do so eventually, and this information is available if
     # you need to use it.
     block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
+
+    # The HTML standard defines an unusual content model for these tags.
+    # We represent this by using a string class other than NavigableString
+    # inside these tags.
+    #
+    # I made this list by going through the HTML spec
+    # (https://html.spec.whatwg.org/#metadata-content) and looking for
+    # "metadata content" elements that can contain strings.
+    #
+    # TODO: Arguably <noscript> could go here but it seems
+    # qualitatively different from the other tags.
+    DEFAULT_STRING_CONTAINERS = {
+        'style': Stylesheet,
+        'script': Script,
+        'template': TemplateString,
+    }    
     
     # The HTML standard defines these attributes as containing a
     # space-separated list of values, not a single value. That is,
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 32a0856..b36189d 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -39,7 +39,18 @@ except ImportError, e:
     new_html5lib = True
 
 class HTML5TreeBuilder(HTMLTreeBuilder):
-    """Use html5lib to build a tree."""
+    """Use html5lib to build a tree.
+
+    Note that this TreeBuilder does not support some features common
+    to HTML TreeBuilders. Some of these features could theoretically
+    be implemented, but at the very least it's quite difficult,
+    because html5lib moves the parse tree around as it's being built.
+
+    * This TreeBuilder doesn't use different subclasses of NavigableString
+      based on the name of the tag in which the string was found.
+
+    * You can't use a SoupStrainer to parse only part of a document.
+    """
 
     NAME = "html5lib"
 
@@ -116,6 +127,9 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
                 "", "html.parser", store_line_numbers=store_line_numbers,
                 **kwargs
             )
+        # TODO: What are **kwargs exactly? Should they be passed in
+        # here in addition to/instead of being passed to the BeautifulSoup
+        # constructor?
         super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
 
         # This will be set later to an html5lib.html5parser.HTMLParser
diff --git a/bs4/element.py b/bs4/element.py
index e403839..8c553cd 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -992,6 +992,33 @@ class Doctype(PreformattedString):
     SUFFIX = u'>\n'
 
 
+class Stylesheet(NavigableString):
+    """A NavigableString representing a stylesheet (probably
+    CSS).
+
+    Used to distinguish embedded stylesheets from textual content.
+    """
+    pass
+
+    
+class Script(NavigableString):
+    """A NavigableString representing an executable script (probably
+    Javascript).
+
+    Used to distinguish executable code from textual content.
+    """
+    pass
+
+
+class TemplateString(NavigableString):
+    """A NavigableString representing a string found inside an HTML
+    template embedded in a larger document.
+
+    Used to distinguish such strings from the main body of the document.
+    """
+    pass
+
+
 class Tag(PageElement):
     """Represents an HTML or XML tag that is part of a parse tree, along
     with its attributes and contents.
@@ -1211,7 +1238,7 @@ class Tag(PageElement):
             a subclass not found in this list will be ignored. By
             default, this means only NavigableString and CData objects
             will be considered. So no comments, processing instructions,
-            etc.
+            stylesheets, etc.
 
         :return: A string.
         """
diff --git a/bs4/testing.py b/bs4/testing.py
index a162778..328bd56 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -16,6 +16,8 @@ from bs4.element import (
     ContentMetaAttributeValue,
     Doctype,
     SoupStrainer,
+    Script,
+    Stylesheet,
     Tag
 )
 
@@ -233,6 +235,22 @@ class HTMLTreeBuilderSmokeTest(object):
             new_tag = soup.new_tag(name)
             self.assertEqual(True, new_tag.is_empty_element)
 
+    def test_special_string_containers(self):
+        soup = self.soup(
+            "<style>Some CSS</style><script>Some Javascript</script>"
+        )
+        assert isinstance(soup.style.string, Stylesheet)
+        assert isinstance(soup.script.string, Script)
+
+        soup = self.soup(
+            "<style><!--Some CSS--></style>"
+        )
+        assert isinstance(soup.style.string, Stylesheet)
+        # The contents of the style tag resemble an HTML comment, but
+        # it's not treated as a comment.
+        self.assertEqual("<!--Some CSS-->", soup.style.string)
+        assert isinstance(soup.style.string, Stylesheet)
+        
     def test_pickle_and_unpickle_identity(self):
         # Pickling a tree, then unpickling it, yields a tree identical
         # to the original.
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 6446f84..7b0a6d4 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -182,3 +182,9 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
         soup = self.soup(markup, store_line_numbers=False)
         self.assertEqual("sourceline", soup.p.sourceline.name)
         self.assertEqual("sourcepos", soup.p.sourcepos.name)
+
+    def test_special_string_containers(self):
+        # The html5lib tree builder doesn't support this standard feature,
+        # because there's no way of knowing, when a string is created,
+        # where in the tree it will eventually end up.
+        pass
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index dc88662..8d0583c 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -73,6 +73,7 @@ class TestConstructor(SoupTest):
                 self.store_line_numbers = False
                 self.cdata_list_attributes = []
                 self.preserve_whitespace_tags = []
+                self.string_containers = {}
             def initialize_soup(self, soup):
                 pass
             def feed(self, markup):
@@ -186,7 +187,41 @@ class TestConstructor(SoupTest):
             isinstance(x, (TagPlus, StringPlus, CommentPlus))
             for x in soup.recursiveChildGenerator()
         )
+
+    def test_alternate_string_containers(self):
+        # Test the ability to customize the string containers for
+        # different types of tags.
+        class PString(NavigableString):
+            pass
+
+        class BString(NavigableString):
+            pass
+
+        soup = self.soup(
+            "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
+            string_containers = {
+                'b': BString,
+                'p': PString,
+            }
+        )
+
+        # The string before the <p> tag is a regular NavigableString.
+        assert isinstance(soup.div.contents[0], NavigableString)
         
+        # The string inside the <p> tag, but not inside the <b> tag,
+        # is a PString.
+        assert isinstance(soup.p.contents[0], PString)
+
+        # Every string inside the <b> tag is a BString, even the one that
+        # was also inside an <i> tag.
+        for s in soup.b.strings:
+            assert isinstance(s, BString)
+
+        # Now that parsing is complete, the string_container_stack
+        # (where this information was kept) has been cleared out.
+        self.assertEqual([], soup.string_container_stack)
+
+
 class TestWarnings(SoupTest):
 
     def _no_parser_specified(self, s, is_there=True):
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 80aaaff..7ecab9e 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -27,8 +27,11 @@ from bs4.element import (
     Doctype,
     Formatter,
     NavigableString,
+    Script,
     SoupStrainer,
+    Stylesheet,
     Tag,
+    TemplateString,
 )
 from bs4.testing import (
     SoupTest,
@@ -1408,7 +1411,7 @@ class TestElementObjects(SoupTest):
         self.assertEqual(soup.a.get_text(","), "a,r, , t ")
         self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
 
-    def test_get_text_ignores_comments(self):
+    def test_get_text_ignores_special_string_containers(self):
         soup = self.soup("foo<!--IGNORE-->bar")
         self.assertEqual(soup.get_text(), "foobar")
 
@@ -1417,10 +1420,17 @@ class TestElementObjects(SoupTest):
         self.assertEqual(
             soup.get_text(types=None), "fooIGNOREbar")
 
-    def test_all_strings_ignores_comments(self):
+        soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
+        self.assertEqual(soup.get_text(), "foobar")
+        
+    def test_all_strings_ignores_special_string_containers(self):
         soup = self.soup("foo<!--IGNORE-->bar")
         self.assertEqual(['foo', 'bar'], list(soup.strings))
 
+        soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
+        self.assertEqual(['foo', 'bar'], list(soup.strings))
+
+
 class TestCDAtaListAttributes(SoupTest):
 
     """Testing cdata-list attributes like 'class'.
@@ -1874,6 +1884,31 @@ class TestNavigableStringSubclasses(SoupTest):
         d = Declaration("foo")
         self.assertEqual("<?foo?>", d.output_ready())
 
+    def test_default_string_containers(self):
+        # In some cases, we use different NavigableString subclasses for
+        # the same text in different tags.
+        soup = self.soup(
+            "<div>text</div><script>text</script><style>text</style>"
+        )
+        self.assertEqual(
+            [NavigableString, Script, Stylesheet],
+            [x.__class__ for x in soup.find_all(text=True)]
+        )
+
+        # The TemplateString is a little unusual because it's generally found
+        # _inside_ children of a <template> element, not a direct child of the
+        # <template> element.
+        soup = self.soup(
+            "<template>Some text<p>In a tag</p></template>Some text outside"
+        )
+        assert all(isinstance(x, TemplateString) for x in soup.template.strings)
+
+        # Once the <template> tag closed, we went back to using
+        # NavigableString.
+        outside = soup.template.next_sibling
+        assert isinstance(outside, NavigableString)
+        assert not isinstance(outside, TemplateString)
+
 class TestSoupSelector(TreeTest):
 
     HTML = """
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 15c86b9..a233e89 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -540,7 +540,7 @@ Comments and other special strings
 
 ``Tag``, ``NavigableString``, and ``BeautifulSoup`` cover almost
 everything you'll see in an HTML or XML file, but there are a few
-leftover bits. The only one you'll probably ever need to worry about
+leftover bits. The main one you'll probably encounter
 is the comment::
 
  markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
@@ -562,9 +562,19 @@ displayed with special formatting::
  #  <!--Hey, buddy. Want to buy a used parser?-->
  # </b>
 
+Beautiful Soup also defines classes called ``Stylesheet``, ``Script``,
+and ``TemplateString``, for embedded CSS stylesheets (any strings
+found inside a ``<style>`` tag), embedded Javascript (any strings
+found in a ``<script>`` tag), and HTML templates (any strings inside a
+``<template>`` tag). These classes work exactly the same way as
+``NavigableString``; their only purpose is to make it easier to pick
+out the main body of the page, by ignoring strings that represent
+something else. (These classes are new in Beautiful Soup 4.9.0, and
+the html5lib parser doesn't use them.)
+ 
 Beautiful Soup defines classes for anything else that might show up in
 an XML document: ``CData``, ``ProcessingInstruction``,
-``Declaration``, and ``Doctype``. Just like ``Comment``, these classes
+``Declaration``, and ``Doctype``. Like ``Comment``, these classes
 are subclasses of ``NavigableString`` that add something extra to the
 string. Here's an example that replaces the comment with a CDATA
 block::
@@ -577,7 +587,7 @@ block::
  # <b>
  #  <![CDATA[A CDATA block]]>
  # </b>
-
+ 
 
 Navigating the tree
 ===================
diff --git a/setup.py b/setup.py
index 23af491..31c4541 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@ setup(
     # NOTE: We can't import __version__ from bs4 because bs4/__init__.py is Python 2 code,
     # and converting it to Python 3 means going through this code to run 2to3.
     # So we have to specify it twice for the time being.
-    version = '4.8.2',
+    version = '4.9.0',
     author="Leonard Richardson",
     author_email='leonardr@segfault.org',
     url="http://www.crummy.com/software/BeautifulSoup/bs4/",
author	Leonard Richardson <leonardr@segfault.org>	2020-04-05 15:43:58 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2020-04-05 15:43:58 -0400
commit	a6f897b213bb08f0d8d8a1528937541c280abbd6 (patch)
tree	866d3392a854ea27a172e9b456b2160307e39363
parent	ddadf13ef66122d75eadaf7f10e0937429e6a3a6 (diff)