diff options
author | Leonard Richardson <leonardr@segfault.org> | 2020-04-05 15:43:58 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2020-04-05 15:43:58 -0400 |
commit | a6f897b213bb08f0d8d8a1528937541c280abbd6 (patch) | |
tree | 866d3392a854ea27a172e9b456b2160307e39363 /bs4/builder/__init__.py | |
parent | ddadf13ef66122d75eadaf7f10e0937429e6a3a6 (diff) |
Embedded CSS and JavaScript are now stored in distinct Stylesheet and
Script string objects (instead of NavigableString), which are ignored by
methods like get_text(). This feature is not supported by the html5lib
treebuilder. [bug=1868861]
Diffstat (limited to 'bs4/builder/__init__.py')
-rw-r--r-- | bs4/builder/__init__.py | 43 |
1 files changed, 39 insertions, 4 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index e8d78f9..7d3a6eb 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -7,8 +7,11 @@ import sys from bs4.element import ( CharsetMetaAttributeValue, ContentMetaAttributeValue, + Stylesheet, + Script, + TemplateString, nonwhitespace_re - ) +) __all__ = [ 'HTMLTreeBuilder', @@ -111,7 +114,12 @@ class TreeBuilder(object): # comma-separated list of CDATA, rather than a single CDATA. DEFAULT_CDATA_LIST_ATTRIBUTES = {} + # Whitespace should be preserved inside these tags. DEFAULT_PRESERVE_WHITESPACE_TAGS = set() + + # The textual contents of tags with these names should be + # instantiated with some class other than NavigableString. + DEFAULT_STRING_CONTAINERS = {} USE_DEFAULT = object() @@ -120,12 +128,14 @@ class TreeBuilder(object): def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT, - store_line_numbers=USE_DEFAULT): + store_line_numbers=USE_DEFAULT, + string_containers=USE_DEFAULT, + ): """Constructor. :param multi_valued_attributes: If this is set to None, the TreeBuilder will not turn any values for attributes like - 'class' into lists. Setting this do a dictionary will + 'class' into lists. Setting this to a dictionary will customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES for an example. @@ -138,6 +148,12 @@ class TreeBuilder(object): are immune from pretty-printing; their contents will always be output as-is. + :param string_containers: A dictionary mapping tag names to + the classes that should be instantiated to contain the textual + contents of those tags. The default is to use NavigableString + for every tag, no matter what the name. You can override the + default by changing DEFAULT_STRING_CONTAINERS. 
+ :param store_line_numbers: If the parser keeps track of the line numbers and positions of the original markup, that information will, by default, be stored in each corresponding @@ -155,7 +171,10 @@ class TreeBuilder(object): self.preserve_whitespace_tags = preserve_whitespace_tags if store_line_numbers == self.USE_DEFAULT: store_line_numbers = self.TRACKS_LINE_NUMBERS - self.store_line_numbers = store_line_numbers + self.store_line_numbers = store_line_numbers + if string_containers == self.USE_DEFAULT: + string_containers = self.DEFAULT_STRING_CONTAINERS + self.string_containers = string_containers def initialize_soup(self, soup): """The BeautifulSoup object has been initialized and is now @@ -369,6 +388,22 @@ class HTMLTreeBuilder(TreeBuilder): # but it may do so eventually, and this information is available if # you need to use it. block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"]) + + # The HTML standard defines an unusual content model for these tags. + # We represent this by using a string class other than NavigableString + # inside these tags. + # + # I made this list by going through the HTML spec + # (https://html.spec.whatwg.org/#metadata-content) and looking for + # "metadata content" elements that can contain strings. + # + # TODO: Arguably <noscript> could go here but it seems + # qualitatively different from the other tags. + DEFAULT_STRING_CONTAINERS = { + 'style': Stylesheet, + 'script': Script, + 'template': TemplateString, + } # The HTML standard defines these attributes as containing a # space-separated list of values, not a single value. That is, |