summaryrefslogtreecommitdiff
path: root/bs4/builder/__init__.py
diff options
context:
space:
mode:
authorLeonard Richardson <leonardr@segfault.org>2020-04-05 15:43:58 -0400
committerLeonard Richardson <leonardr@segfault.org>2020-04-05 15:43:58 -0400
commita6f897b213bb08f0d8d8a1528937541c280abbd6 (patch)
tree866d3392a854ea27a172e9b456b2160307e39363 /bs4/builder/__init__.py
parentddadf13ef66122d75eadaf7f10e0937429e6a3a6 (diff)
Embedded CSS and JavaScript are now stored in distinct Stylesheet and
Script string objects, which are ignored by methods like get_text(). This feature is not supported by the html5lib treebuilder. [bug=1868861]
Diffstat (limited to 'bs4/builder/__init__.py')
-rw-r--r--bs4/builder/__init__.py43
1 files changed, 39 insertions, 4 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index e8d78f9..7d3a6eb 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -7,8 +7,11 @@ import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
+ Stylesheet,
+ Script,
+ TemplateString,
nonwhitespace_re
- )
+)
__all__ = [
'HTMLTreeBuilder',
@@ -111,7 +114,12 @@ class TreeBuilder(object):
# comma-separated list of CDATA, rather than a single CDATA.
DEFAULT_CDATA_LIST_ATTRIBUTES = {}
+ # Whitespace should be preserved inside these tags.
DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
+
+ # The textual contents of tags with these names should be
+ # instantiated with some class other than NavigableString.
+ DEFAULT_STRING_CONTAINERS = {}
USE_DEFAULT = object()
@@ -120,12 +128,14 @@ class TreeBuilder(object):
def __init__(self, multi_valued_attributes=USE_DEFAULT,
preserve_whitespace_tags=USE_DEFAULT,
- store_line_numbers=USE_DEFAULT):
+ store_line_numbers=USE_DEFAULT,
+ string_containers=USE_DEFAULT,
+ ):
"""Constructor.
:param multi_valued_attributes: If this is set to None, the
TreeBuilder will not turn any values for attributes like
- 'class' into lists. Setting this do a dictionary will
+ 'class' into lists. Setting this to a dictionary will
customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
for an example.
@@ -138,6 +148,12 @@ class TreeBuilder(object):
are immune from pretty-printing; their contents will always be
output as-is.
+ :param string_containers: A dictionary mapping tag names to
+ the classes that should be instantiated to contain the textual
+ contents of those tags. The default is to use NavigableString
+ for every tag, no matter what the name. You can override the
+ default by changing DEFAULT_STRING_CONTAINERS.
+
:param store_line_numbers: If the parser keeps track of the
line numbers and positions of the original markup, that
information will, by default, be stored in each corresponding
@@ -155,7 +171,10 @@ class TreeBuilder(object):
self.preserve_whitespace_tags = preserve_whitespace_tags
if store_line_numbers == self.USE_DEFAULT:
store_line_numbers = self.TRACKS_LINE_NUMBERS
- self.store_line_numbers = store_line_numbers
+ self.store_line_numbers = store_line_numbers
+ if string_containers == self.USE_DEFAULT:
+ string_containers = self.DEFAULT_STRING_CONTAINERS
+ self.string_containers = string_containers
def initialize_soup(self, soup):
"""The BeautifulSoup object has been initialized and is now
@@ -369,6 +388,22 @@ class HTMLTreeBuilder(TreeBuilder):
# but it may do so eventually, and this information is available if
# you need to use it.
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
+
+ # The HTML standard defines an unusual content model for these tags.
+ # We represent this by using a string class other than NavigableString
+ # inside these tags.
+ #
+ # I made this list by going through the HTML spec
+ # (https://html.spec.whatwg.org/#metadata-content) and looking for
+ # "metadata content" elements that can contain strings.
+ #
+ # TODO: Arguably <noscript> could go here but it seems
+ # qualitatively different from the other tags.
+ DEFAULT_STRING_CONTAINERS = {
+ 'style': Stylesheet,
+ 'script': Script,
+ 'template': TemplateString,
+ }
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,