diff options
author | Leonard Richardson <leonardr@segfault.org> | 2020-04-05 15:43:58 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2020-04-05 15:43:58 -0400 |
commit | a6f897b213bb08f0d8d8a1528937541c280abbd6 (patch) | |
tree | 866d3392a854ea27a172e9b456b2160307e39363 | |
parent | ddadf13ef66122d75eadaf7f10e0937429e6a3a6 (diff) |
Embedded CSS and Javascript are now stored in distinct Stylesheet and
Script strings (NavigableString subclasses), which are ignored by methods
like get_text(). This feature is not supported by the html5lib treebuilder. [bug=1868861]
-rw-r--r-- | CHANGELOG | 6 | ||||
-rw-r--r-- | bs4/__init__.py | 40 | ||||
-rw-r--r-- | bs4/builder/__init__.py | 43 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 16 | ||||
-rw-r--r-- | bs4/element.py | 29 | ||||
-rw-r--r-- | bs4/testing.py | 18 | ||||
-rw-r--r-- | bs4/tests/test_html5lib.py | 6 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 35 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 39 | ||||
-rw-r--r-- | doc/source/index.rst | 16 | ||||
-rw-r--r-- | setup.py | 2 |
11 files changed, 223 insertions, 27 deletions
@@ -1,9 +1,13 @@ -= 4.9.0 (Unreleased) += 4.9.0 (20200405) * Added PageElement.decomposed, a new property which lets you check whether you've already called decompose() on a Tag or NavigableString. +* Embedded CSS and Javascript is now stored in distinct Stylesheet and + Script tags, which are ignored by methods like get_text(). This + feature is not supported by the html5lib treebuilder. [bug=1868861] + * Added a Russian translation by 'authoress' to the repository. * Fixed an unhandled exception when formatting a Tag that had been diff --git a/bs4/__init__.py b/bs4/__init__.py index f828cd2..bae7fda 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -15,8 +15,8 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.8.2" -__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson" +__version__ = "4.9.0" +__copyright__ = "Copyright (c) 2004-2020 Leonard Richardson" # Use of this source code is governed by the MIT license. __license__ = "MIT" @@ -423,6 +423,7 @@ class BeautifulSoup(Tag): self.currentTag = None self.tagStack = [] self.preserve_whitespace_tag_stack = [] + self.string_container_stack = [] self.pushTag(self) def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, @@ -434,14 +435,28 @@ class BeautifulSoup(Tag): sourceline=sourceline, sourcepos=sourcepos ) + def string_container(self, base_class=None): + container = base_class or NavigableString + + # There may be a general override of NavigableString. + container = self.element_classes.get( + container, container + ) + + # On top of that, we may be inside a tag that needs a special + # container class. + if self.string_container_stack: + container = self.builder.string_containers.get( + self.string_container_stack[-1].name, container + ) + return container + def new_string(self, s, subclass=None): """Create a new NavigableString associated with this BeautifulSoup object. 
""" - subclass = subclass or self.element_classes.get( - NavigableString, NavigableString - ) - return subclass(s) + container = self.string_container(subclass) + return container(s) def insert_before(self, successor): """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement @@ -460,6 +475,8 @@ class BeautifulSoup(Tag): tag = self.tagStack.pop() if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: self.preserve_whitespace_tag_stack.pop() + if self.string_container_stack and tag == self.string_container_stack[-1]: + self.string_container_stack.pop() #print "Pop", tag.name if self.tagStack: self.currentTag = self.tagStack[-1] @@ -474,19 +491,14 @@ class BeautifulSoup(Tag): self.currentTag = self.tagStack[-1] if tag.name in self.builder.preserve_whitespace_tags: self.preserve_whitespace_tag_stack.append(tag) + if tag.name in self.builder.string_containers: + self.string_container_stack.append(tag) def endData(self, containerClass=None): """Method called by the TreeBuilder when the end of a data segment occurs. """ - # Default container is NavigableString. - containerClass = containerClass or NavigableString - - # The user may want us to instantiate some alias for the - # container class. - containerClass = self.element_classes.get( - containerClass, containerClass - ) + containerClass = self.string_container(containerClass) if self.current_data: current_data = u''.join(self.current_data) diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index e8d78f9..7d3a6eb 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -7,8 +7,11 @@ import sys from bs4.element import ( CharsetMetaAttributeValue, ContentMetaAttributeValue, + Stylesheet, + Script, + TemplateString, nonwhitespace_re - ) +) __all__ = [ 'HTMLTreeBuilder', @@ -111,7 +114,12 @@ class TreeBuilder(object): # comma-separated list of CDATA, rather than a single CDATA. 
DEFAULT_CDATA_LIST_ATTRIBUTES = {} + # Whitespace should be preserved inside these tags. DEFAULT_PRESERVE_WHITESPACE_TAGS = set() + + # The textual contents of tags with these names should be + # instantiated with some class other than NavigableString. + DEFAULT_STRING_CONTAINERS = {} USE_DEFAULT = object() @@ -120,12 +128,14 @@ class TreeBuilder(object): def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT, - store_line_numbers=USE_DEFAULT): + store_line_numbers=USE_DEFAULT, + string_containers=USE_DEFAULT, + ): """Constructor. :param multi_valued_attributes: If this is set to None, the TreeBuilder will not turn any values for attributes like - 'class' into lists. Setting this do a dictionary will + 'class' into lists. Setting this to a dictionary will customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES for an example. @@ -138,6 +148,12 @@ class TreeBuilder(object): are immune from pretty-printing; their contents will always be output as-is. + :param string_containers: A dictionary mapping tag names to + the classes that should be instantiated to contain the textual + contents of those tags. The default is to use NavigableString + for every tag, no matter what the name. You can override the + default by changing DEFAULT_STRING_CONTAINERS. 
+ :param store_line_numbers: If the parser keeps track of the line numbers and positions of the original markup, that information will, by default, be stored in each corresponding @@ -155,7 +171,10 @@ class TreeBuilder(object): self.preserve_whitespace_tags = preserve_whitespace_tags if store_line_numbers == self.USE_DEFAULT: store_line_numbers = self.TRACKS_LINE_NUMBERS - self.store_line_numbers = store_line_numbers + self.store_line_numbers = store_line_numbers + if string_containers == self.USE_DEFAULT: + string_containers = self.DEFAULT_STRING_CONTAINERS + self.string_containers = string_containers def initialize_soup(self, soup): """The BeautifulSoup object has been initialized and is now @@ -369,6 +388,22 @@ class HTMLTreeBuilder(TreeBuilder): # but it may do so eventually, and this information is available if # you need to use it. block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"]) + + # The HTML standard defines an unusual content model for these tags. + # We represent this by using a string class other than NavigableString + # inside these tags. + # + # I made this list by going through the HTML spec + # (https://html.spec.whatwg.org/#metadata-content) and looking for + # "metadata content" elements that can contain strings. + # + # TODO: Arguably <noscript> could go here but it seems + # qualitatively different from the other tags. + DEFAULT_STRING_CONTAINERS = { + 'style': Stylesheet, + 'script': Script, + 'template': TemplateString, + } # The HTML standard defines these attributes as containing a # space-separated list of values, not a single value. 
That is, diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 32a0856..b36189d 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -39,7 +39,18 @@ except ImportError, e: new_html5lib = True class HTML5TreeBuilder(HTMLTreeBuilder): - """Use html5lib to build a tree.""" + """Use html5lib to build a tree. + + Note that this TreeBuilder does not support some features common + to HTML TreeBuilders. Some of these features could theoretically + be implemented, but at the very least it's quite difficult, + because html5lib moves the parse tree around as it's being built. + + * This TreeBuilder doesn't use different subclasses of NavigableString + based on the name of the tag in which the string was found. + + * You can't use a SoupStrainer to parse only part of a document. + """ NAME = "html5lib" @@ -116,6 +127,9 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): "", "html.parser", store_line_numbers=store_line_numbers, **kwargs ) + # TODO: What are **kwargs exactly? Should they be passed in + # here in addition to/instead of being passed to the BeautifulSoup + # constructor? super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) # This will be set later to an html5lib.html5parser.HTMLParser diff --git a/bs4/element.py b/bs4/element.py index e403839..8c553cd 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -992,6 +992,33 @@ class Doctype(PreformattedString): SUFFIX = u'>\n' +class Stylesheet(NavigableString): + """A NavigableString representing a stylesheet (probably + CSS). + + Used to distinguish embedded stylesheets from textual content. + """ + pass + + +class Script(NavigableString): + """A NavigableString representing an executable script (probably + Javascript). + + Used to distinguish executable code from textual content. + """ + pass + + +class TemplateString(NavigableString): + """A NavigableString representing a string found inside an HTML + template embedded in a larger document. 
+ + Used to distinguish such strings from the main body of the document. + """ + pass + + class Tag(PageElement): """Represents an HTML or XML tag that is part of a parse tree, along with its attributes and contents. @@ -1211,7 +1238,7 @@ class Tag(PageElement): a subclass not found in this list will be ignored. By default, this means only NavigableString and CData objects will be considered. So no comments, processing instructions, - etc. + stylesheets, etc. :return: A string. """ diff --git a/bs4/testing.py b/bs4/testing.py index a162778..328bd56 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -16,6 +16,8 @@ from bs4.element import ( ContentMetaAttributeValue, Doctype, SoupStrainer, + Script, + Stylesheet, Tag ) @@ -233,6 +235,22 @@ class HTMLTreeBuilderSmokeTest(object): new_tag = soup.new_tag(name) self.assertEqual(True, new_tag.is_empty_element) + def test_special_string_containers(self): + soup = self.soup( + "<style>Some CSS</style><script>Some Javascript</script>" + ) + assert isinstance(soup.style.string, Stylesheet) + assert isinstance(soup.script.string, Script) + + soup = self.soup( + "<style><!--Some CSS--></style>" + ) + assert isinstance(soup.style.string, Stylesheet) + # The contents of the style tag resemble an HTML comment, but + # it's not treated as a comment. + self.assertEqual("<!--Some CSS-->", soup.style.string) + assert isinstance(soup.style.string, Stylesheet) + def test_pickle_and_unpickle_identity(self): # Pickling a tree, then unpickling it, yields a tree identical # to the original. 
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py index 6446f84..7b0a6d4 100644 --- a/bs4/tests/test_html5lib.py +++ b/bs4/tests/test_html5lib.py @@ -182,3 +182,9 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): soup = self.soup(markup, store_line_numbers=False) self.assertEqual("sourceline", soup.p.sourceline.name) self.assertEqual("sourcepos", soup.p.sourcepos.name) + + def test_special_string_containers(self): + # The html5lib tree builder doesn't support this standard feature, + # because there's no way of knowing, when a string is created, + # where in the tree it will eventually end up. + pass diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index dc88662..8d0583c 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -73,6 +73,7 @@ class TestConstructor(SoupTest): self.store_line_numbers = False self.cdata_list_attributes = [] self.preserve_whitespace_tags = [] + self.string_containers = {} def initialize_soup(self, soup): pass def feed(self, markup): @@ -186,7 +187,41 @@ class TestConstructor(SoupTest): isinstance(x, (TagPlus, StringPlus, CommentPlus)) for x in soup.recursiveChildGenerator() ) + + def test_alternate_string_containers(self): + # Test the ability to customize the string containers for + # different types of tags. + class PString(NavigableString): + pass + + class BString(NavigableString): + pass + + soup = self.soup( + "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text", + string_containers = { + 'b': BString, + 'p': PString, + } + ) + + # The string before the <p> tag is a regular NavigableString. + assert isinstance(soup.div.contents[0], NavigableString) + # The string inside the <p> tag, but not inside the <i> tag, + # is a PString. + assert isinstance(soup.p.contents[0], PString) + + # Every string inside the <b> tag is a BString, even the one that + # was also inside an <i> tag. 
+ for s in soup.b.strings: + assert isinstance(s, BString) + + # Now that parsing was complete, the string_container_stack + # (where this information was kept) has been cleared out. + self.assertEqual([], soup.string_container_stack) + + class TestWarnings(SoupTest): def _no_parser_specified(self, s, is_there=True): diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 80aaaff..7ecab9e 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -27,8 +27,11 @@ from bs4.element import ( Doctype, Formatter, NavigableString, + Script, SoupStrainer, + Stylesheet, Tag, + TemplateString, ) from bs4.testing import ( SoupTest, @@ -1408,7 +1411,7 @@ class TestElementObjects(SoupTest): self.assertEqual(soup.a.get_text(","), "a,r, , t ") self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t") - def test_get_text_ignores_comments(self): + def test_get_text_ignores_special_string_containers(self): soup = self.soup("foo<!--IGNORE-->bar") self.assertEqual(soup.get_text(), "foobar") @@ -1417,10 +1420,17 @@ class TestElementObjects(SoupTest): self.assertEqual( soup.get_text(types=None), "fooIGNOREbar") - def test_all_strings_ignores_comments(self): + soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar") + self.assertEqual(soup.get_text(), "foobar") + + def test_all_strings_ignores_special_string_containers(self): soup = self.soup("foo<!--IGNORE-->bar") self.assertEqual(['foo', 'bar'], list(soup.strings)) + soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar") + self.assertEqual(['foo', 'bar'], list(soup.strings)) + + class TestCDAtaListAttributes(SoupTest): """Testing cdata-list attributes like 'class'. @@ -1874,6 +1884,31 @@ class TestNavigableStringSubclasses(SoupTest): d = Declaration("foo") self.assertEqual("<?foo?>", d.output_ready()) + def test_default_string_containers(self): + # In some cases, we use different NavigableString subclasses for + # the same text in different tags. 
+ soup = self.soup( + "<div>text</div><script>text</script><style>text</style>" + ) + self.assertEqual( + [NavigableString, Script, Stylesheet], + [x.__class__ for x in soup.find_all(text=True)] + ) + + # The TemplateString is a little unusual because it's generally found + # _inside_ children of a <template> element, not a direct child of the + # <template> element. + soup = self.soup( + "<template>Some text<p>In a tag</p></template>Some text outside" + ) + assert all(isinstance(x, TemplateString) for x in soup.template.strings) + + # Once the <template> tag closed, we went back to using + # NavigableString. + outside = soup.template.next_sibling + assert isinstance(outside, NavigableString) + assert not isinstance(outside, TemplateString) + class TestSoupSelector(TreeTest): HTML = """ diff --git a/doc/source/index.rst b/doc/source/index.rst index 15c86b9..a233e89 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -540,7 +540,7 @@ Comments and other special strings ``Tag``, ``NavigableString``, and ``BeautifulSoup`` cover almost everything you'll see in an HTML or XML file, but there are a few -leftover bits. The only one you'll probably ever need to worry about +leftover bits. The main one you'll probably encounter is the comment:: markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>" @@ -562,9 +562,19 @@ displayed with special formatting:: # <!--Hey, buddy. Want to buy a used parser?--> # </b> +Beautiful Soup also defines classes called ``Stylesheet``, ``Script``, +and ``TemplateString``, for embedded CSS stylesheets (any strings +found inside a ``<style>`` tag), embedded Javascript (any strings +found in a ``<script>`` tag), and HTML templates (any strings inside a +``<template>`` tag). These classes work exactly the same way as +``NavigableString``; their only purpose is to make it easier to pick +out the main body of the page, by ignoring strings that represent +something else. 
(These classes are new in Beautiful Soup 4.9.0, and +the html5lib parser doesn't use them.) + Beautiful Soup defines classes for anything else that might show up in an XML document: ``CData``, ``ProcessingInstruction``, -``Declaration``, and ``Doctype``. Just like ``Comment``, these classes +``Declaration``, and ``Doctype``. Like ``Comment``, these classes are subclasses of ``NavigableString`` that add something extra to the string. Here's an example that replaces the comment with a CDATA block:: @@ -577,7 +587,7 @@ block:: # <b> # <![CDATA[A CDATA block]]> # </b> - + Navigating the tree =================== @@ -11,7 +11,7 @@ setup( # NOTE: We can't import __version__ from bs4 because bs4/__init__.py is Python 2 code, # and converting it to Python 3 means going through this code to run 2to3. # So we have to specify it twice for the time being. - version = '4.8.2', + version = '4.9.0', author="Leonard Richardson", author_email='leonardr@segfault.org', url="http://www.crummy.com/software/BeautifulSoup/bs4/", |