summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2020-04-05 15:43:58 -0400
committer Leonard Richardson <leonardr@segfault.org> 2020-04-05 15:43:58 -0400
commit a6f897b213bb08f0d8d8a1528937541c280abbd6 (patch)
tree 866d3392a854ea27a172e9b456b2160307e39363
parent ddadf13ef66122d75eadaf7f10e0937429e6a3a6 (diff)
Embedded CSS and Javascript are now stored in distinct Stylesheet and
Script tags, which are ignored by methods like get_text(). This feature is not supported by the html5lib treebuilder. [bug=1868861]
-rw-r--r-- CHANGELOG 6
-rw-r--r-- bs4/__init__.py 40
-rw-r--r-- bs4/builder/__init__.py 43
-rw-r--r-- bs4/builder/_html5lib.py 16
-rw-r--r-- bs4/element.py 29
-rw-r--r-- bs4/testing.py 18
-rw-r--r-- bs4/tests/test_html5lib.py 6
-rw-r--r-- bs4/tests/test_soup.py 35
-rw-r--r-- bs4/tests/test_tree.py 39
-rw-r--r-- doc/source/index.rst 16
-rw-r--r-- setup.py 2
11 files changed, 223 insertions, 27 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 5852a12..1c7d57d 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,9 +1,13 @@
-= 4.9.0 (Unreleased)
+= 4.9.0 (20200405)
* Added PageElement.decomposed, a new property which lets you
check whether you've already called decompose() on a Tag or
NavigableString.
+* Embedded CSS and Javascript are now stored in distinct Stylesheet and
+ Script tags, which are ignored by methods like get_text(). This
+ feature is not supported by the html5lib treebuilder. [bug=1868861]
+
* Added a Russian translation by 'authoress' to the repository.
* Fixed an unhandled exception when formatting a Tag that had been
diff --git a/bs4/__init__.py b/bs4/__init__.py
index f828cd2..bae7fda 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -15,8 +15,8 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.8.2"
-__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
+__version__ = "4.9.0"
+__copyright__ = "Copyright (c) 2004-2020 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
@@ -423,6 +423,7 @@ class BeautifulSoup(Tag):
self.currentTag = None
self.tagStack = []
self.preserve_whitespace_tag_stack = []
+ self.string_container_stack = []
self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
@@ -434,14 +435,28 @@ class BeautifulSoup(Tag):
sourceline=sourceline, sourcepos=sourcepos
)
+ def string_container(self, base_class=None):
+ container = base_class or NavigableString
+
+ # There may be a general override of NavigableString.
+ container = self.element_classes.get(
+ container, container
+ )
+
+ # On top of that, we may be inside a tag that needs a special
+ # container class.
+ if self.string_container_stack:
+ container = self.builder.string_containers.get(
+ self.string_container_stack[-1].name, container
+ )
+ return container
+
def new_string(self, s, subclass=None):
"""Create a new NavigableString associated with this BeautifulSoup
object.
"""
- subclass = subclass or self.element_classes.get(
- NavigableString, NavigableString
- )
- return subclass(s)
+ container = self.string_container(subclass)
+ return container(s)
def insert_before(self, successor):
"""This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
@@ -460,6 +475,8 @@ class BeautifulSoup(Tag):
tag = self.tagStack.pop()
if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
self.preserve_whitespace_tag_stack.pop()
+ if self.string_container_stack and tag == self.string_container_stack[-1]:
+ self.string_container_stack.pop()
#print "Pop", tag.name
if self.tagStack:
self.currentTag = self.tagStack[-1]
@@ -474,19 +491,14 @@ class BeautifulSoup(Tag):
self.currentTag = self.tagStack[-1]
if tag.name in self.builder.preserve_whitespace_tags:
self.preserve_whitespace_tag_stack.append(tag)
+ if tag.name in self.builder.string_containers:
+ self.string_container_stack.append(tag)
def endData(self, containerClass=None):
"""Method called by the TreeBuilder when the end of a data segment
occurs.
"""
- # Default container is NavigableString.
- containerClass = containerClass or NavigableString
-
- # The user may want us to instantiate some alias for the
- # container class.
- containerClass = self.element_classes.get(
- containerClass, containerClass
- )
+ containerClass = self.string_container(containerClass)
if self.current_data:
current_data = u''.join(self.current_data)
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index e8d78f9..7d3a6eb 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -7,8 +7,11 @@ import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
+ Stylesheet,
+ Script,
+ TemplateString,
nonwhitespace_re
- )
+)
__all__ = [
'HTMLTreeBuilder',
@@ -111,7 +114,12 @@ class TreeBuilder(object):
# comma-separated list of CDATA, rather than a single CDATA.
DEFAULT_CDATA_LIST_ATTRIBUTES = {}
+ # Whitespace should be preserved inside these tags.
DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
+
+ # The textual contents of tags with these names should be
+ # instantiated with some class other than NavigableString.
+ DEFAULT_STRING_CONTAINERS = {}
USE_DEFAULT = object()
@@ -120,12 +128,14 @@ class TreeBuilder(object):
def __init__(self, multi_valued_attributes=USE_DEFAULT,
preserve_whitespace_tags=USE_DEFAULT,
- store_line_numbers=USE_DEFAULT):
+ store_line_numbers=USE_DEFAULT,
+ string_containers=USE_DEFAULT,
+ ):
"""Constructor.
:param multi_valued_attributes: If this is set to None, the
TreeBuilder will not turn any values for attributes like
- 'class' into lists. Setting this do a dictionary will
+ 'class' into lists. Setting this to a dictionary will
customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
for an example.
@@ -138,6 +148,12 @@ class TreeBuilder(object):
are immune from pretty-printing; their contents will always be
output as-is.
+ :param string_containers: A dictionary mapping tag names to
+ the classes that should be instantiated to contain the textual
+ contents of those tags. The default is to use NavigableString
+ for every tag, no matter what the name. You can override the
+ default by changing DEFAULT_STRING_CONTAINERS.
+
:param store_line_numbers: If the parser keeps track of the
line numbers and positions of the original markup, that
information will, by default, be stored in each corresponding
@@ -155,7 +171,10 @@ class TreeBuilder(object):
self.preserve_whitespace_tags = preserve_whitespace_tags
if store_line_numbers == self.USE_DEFAULT:
store_line_numbers = self.TRACKS_LINE_NUMBERS
- self.store_line_numbers = store_line_numbers
+ self.store_line_numbers = store_line_numbers
+ if string_containers == self.USE_DEFAULT:
+ string_containers = self.DEFAULT_STRING_CONTAINERS
+ self.string_containers = string_containers
def initialize_soup(self, soup):
"""The BeautifulSoup object has been initialized and is now
@@ -369,6 +388,22 @@ class HTMLTreeBuilder(TreeBuilder):
# but it may do so eventually, and this information is available if
# you need to use it.
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
+
+ # The HTML standard defines an unusual content model for these tags.
+ # We represent this by using a string class other than NavigableString
+ # inside these tags.
+ #
+ # I made this list by going through the HTML spec
+ # (https://html.spec.whatwg.org/#metadata-content) and looking for
+ # "metadata content" elements that can contain strings.
+ #
+ # TODO: Arguably <noscript> could go here but it seems
+ # qualitatively different from the other tags.
+ DEFAULT_STRING_CONTAINERS = {
+ 'style': Stylesheet,
+ 'script': Script,
+ 'template': TemplateString,
+ }
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 32a0856..b36189d 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -39,7 +39,18 @@ except ImportError, e:
new_html5lib = True
class HTML5TreeBuilder(HTMLTreeBuilder):
- """Use html5lib to build a tree."""
+ """Use html5lib to build a tree.
+
+ Note that this TreeBuilder does not support some features common
+ to HTML TreeBuilders. Some of these features could theoretically
+ be implemented, but at the very least it's quite difficult,
+ because html5lib moves the parse tree around as it's being built.
+
+ * This TreeBuilder doesn't use different subclasses of NavigableString
+ based on the name of the tag in which the string was found.
+
+ * You can't use a SoupStrainer to parse only part of a document.
+ """
NAME = "html5lib"
@@ -116,6 +127,9 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
"", "html.parser", store_line_numbers=store_line_numbers,
**kwargs
)
+ # TODO: What are **kwargs exactly? Should they be passed in
+ # here in addition to/instead of being passed to the BeautifulSoup
+ # constructor?
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
# This will be set later to an html5lib.html5parser.HTMLParser
diff --git a/bs4/element.py b/bs4/element.py
index e403839..8c553cd 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -992,6 +992,33 @@ class Doctype(PreformattedString):
SUFFIX = u'>\n'
+class Stylesheet(NavigableString):
+ """A NavigableString representing a stylesheet (probably
+ CSS).
+
+ Used to distinguish embedded stylesheets from textual content.
+ """
+ pass
+
+
+class Script(NavigableString):
+ """A NavigableString representing an executable script (probably
+ Javascript).
+
+ Used to distinguish executable code from textual content.
+ """
+ pass
+
+
+class TemplateString(NavigableString):
+ """A NavigableString representing a string found inside an HTML
+ template embedded in a larger document.
+
+ Used to distinguish such strings from the main body of the document.
+ """
+ pass
+
+
class Tag(PageElement):
"""Represents an HTML or XML tag that is part of a parse tree, along
with its attributes and contents.
@@ -1211,7 +1238,7 @@ class Tag(PageElement):
a subclass not found in this list will be ignored. By
default, this means only NavigableString and CData objects
will be considered. So no comments, processing instructions,
- etc.
+ stylesheets, etc.
:return: A string.
"""
diff --git a/bs4/testing.py b/bs4/testing.py
index a162778..328bd56 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -16,6 +16,8 @@ from bs4.element import (
ContentMetaAttributeValue,
Doctype,
SoupStrainer,
+ Script,
+ Stylesheet,
Tag
)
@@ -233,6 +235,22 @@ class HTMLTreeBuilderSmokeTest(object):
new_tag = soup.new_tag(name)
self.assertEqual(True, new_tag.is_empty_element)
+ def test_special_string_containers(self):
+ soup = self.soup(
+ "<style>Some CSS</style><script>Some Javascript</script>"
+ )
+ assert isinstance(soup.style.string, Stylesheet)
+ assert isinstance(soup.script.string, Script)
+
+ soup = self.soup(
+ "<style><!--Some CSS--></style>"
+ )
+ assert isinstance(soup.style.string, Stylesheet)
+ # The contents of the style tag resemble an HTML comment, but
+ # it's not treated as a comment.
+ self.assertEqual("<!--Some CSS-->", soup.style.string)
+ assert isinstance(soup.style.string, Stylesheet)
+
def test_pickle_and_unpickle_identity(self):
# Pickling a tree, then unpickling it, yields a tree identical
# to the original.
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 6446f84..7b0a6d4 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -182,3 +182,9 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
soup = self.soup(markup, store_line_numbers=False)
self.assertEqual("sourceline", soup.p.sourceline.name)
self.assertEqual("sourcepos", soup.p.sourcepos.name)
+
+ def test_special_string_containers(self):
+ # The html5lib tree builder doesn't support this standard feature,
+ # because there's no way of knowing, when a string is created,
+ # where in the tree it will eventually end up.
+ pass
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index dc88662..8d0583c 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -73,6 +73,7 @@ class TestConstructor(SoupTest):
self.store_line_numbers = False
self.cdata_list_attributes = []
self.preserve_whitespace_tags = []
+ self.string_containers = {}
def initialize_soup(self, soup):
pass
def feed(self, markup):
@@ -186,7 +187,41 @@ class TestConstructor(SoupTest):
isinstance(x, (TagPlus, StringPlus, CommentPlus))
for x in soup.recursiveChildGenerator()
)
+
+ def test_alternate_string_containers(self):
+ # Test the ability to customize the string containers for
+ # different types of tags.
+ class PString(NavigableString):
+ pass
+
+ class BString(NavigableString):
+ pass
+
+ soup = self.soup(
+ "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
+ string_containers = {
+ 'b': BString,
+ 'p': PString,
+ }
+ )
+
+ # The string before the <p> tag is a regular NavigableString.
+ assert isinstance(soup.div.contents[0], NavigableString)
+ # The string inside the <p> tag, but not inside the <b> tag,
+ # is a PString.
+ assert isinstance(soup.p.contents[0], PString)
+
+ # Every string inside the <b> tag is a BString, even the one that
+ # was also inside an <i> tag.
+ for s in soup.b.strings:
+ assert isinstance(s, BString)
+
+ # Now that parsing is complete, the string_container_stack
+ # (where this information was kept) has been cleared out.
+ self.assertEqual([], soup.string_container_stack)
+
+
class TestWarnings(SoupTest):
def _no_parser_specified(self, s, is_there=True):
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 80aaaff..7ecab9e 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -27,8 +27,11 @@ from bs4.element import (
Doctype,
Formatter,
NavigableString,
+ Script,
SoupStrainer,
+ Stylesheet,
Tag,
+ TemplateString,
)
from bs4.testing import (
SoupTest,
@@ -1408,7 +1411,7 @@ class TestElementObjects(SoupTest):
self.assertEqual(soup.a.get_text(","), "a,r, , t ")
self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
- def test_get_text_ignores_comments(self):
+ def test_get_text_ignores_special_string_containers(self):
soup = self.soup("foo<!--IGNORE-->bar")
self.assertEqual(soup.get_text(), "foobar")
@@ -1417,10 +1420,17 @@ class TestElementObjects(SoupTest):
self.assertEqual(
soup.get_text(types=None), "fooIGNOREbar")
- def test_all_strings_ignores_comments(self):
+ soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
+ self.assertEqual(soup.get_text(), "foobar")
+
+ def test_all_strings_ignores_special_string_containers(self):
soup = self.soup("foo<!--IGNORE-->bar")
self.assertEqual(['foo', 'bar'], list(soup.strings))
+ soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
+ self.assertEqual(['foo', 'bar'], list(soup.strings))
+
+
class TestCDAtaListAttributes(SoupTest):
"""Testing cdata-list attributes like 'class'.
@@ -1874,6 +1884,31 @@ class TestNavigableStringSubclasses(SoupTest):
d = Declaration("foo")
self.assertEqual("<?foo?>", d.output_ready())
+ def test_default_string_containers(self):
+ # In some cases, we use different NavigableString subclasses for
+ # the same text in different tags.
+ soup = self.soup(
+ "<div>text</div><script>text</script><style>text</style>"
+ )
+ self.assertEqual(
+ [NavigableString, Script, Stylesheet],
+ [x.__class__ for x in soup.find_all(text=True)]
+ )
+
+ # The TemplateString is a little unusual because it's generally found
+ # _inside_ children of a <template> element, not a direct child of the
+ # <template> element.
+ soup = self.soup(
+ "<template>Some text<p>In a tag</p></template>Some text outside"
+ )
+ assert all(isinstance(x, TemplateString) for x in soup.template.strings)
+
+ # Once the <template> tag closed, we went back to using
+ # NavigableString.
+ outside = soup.template.next_sibling
+ assert isinstance(outside, NavigableString)
+ assert not isinstance(outside, TemplateString)
+
class TestSoupSelector(TreeTest):
HTML = """
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 15c86b9..a233e89 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -540,7 +540,7 @@ Comments and other special strings
``Tag``, ``NavigableString``, and ``BeautifulSoup`` cover almost
everything you'll see in an HTML or XML file, but there are a few
-leftover bits. The only one you'll probably ever need to worry about
+leftover bits. The main one you'll probably encounter
is the comment::
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
@@ -562,9 +562,19 @@ displayed with special formatting::
# <!--Hey, buddy. Want to buy a used parser?-->
# </b>
+Beautiful Soup also defines classes called ``Stylesheet``, ``Script``,
+and ``TemplateString``, for embedded CSS stylesheets (any strings
+found inside a ``<style>`` tag), embedded Javascript (any strings
+found in a ``<script>`` tag), and HTML templates (any strings inside a
+``<template>`` tag). These classes work exactly the same way as
+``NavigableString``; their only purpose is to make it easier to pick
+out the main body of the page, by ignoring strings that represent
+something else. (These classes are new in Beautiful Soup 4.9.0, and
+the html5lib parser doesn't use them.)
+
Beautiful Soup defines classes for anything else that might show up in
an XML document: ``CData``, ``ProcessingInstruction``,
-``Declaration``, and ``Doctype``. Just like ``Comment``, these classes
+``Declaration``, and ``Doctype``. Like ``Comment``, these classes
are subclasses of ``NavigableString`` that add something extra to the
string. Here's an example that replaces the comment with a CDATA
block::
@@ -577,7 +587,7 @@ block::
# <b>
# <![CDATA[A CDATA block]]>
# </b>
-
+
Navigating the tree
===================
diff --git a/setup.py b/setup.py
index 23af491..31c4541 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@ setup(
# NOTE: We can't import __version__ from bs4 because bs4/__init__.py is Python 2 code,
# and converting it to Python 3 means going through this code to run 2to3.
# So we have to specify it twice for the time being.
- version = '4.8.2',
+ version = '4.9.0',
author="Leonard Richardson",
author_email='leonardr@segfault.org',
url="http://www.crummy.com/software/BeautifulSoup/bs4/",