From 0df054db08ef3286482694ee0c9aa85b5313dfd2 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sun, 14 Jul 2019 17:09:58 -0400 Subject: Give the Formatter class more control over formatting decisions. --- bs4/builder/__init__.py | 18 ++- bs4/element.py | 296 ++++++++++++++++++++++-------------------------- bs4/tests/test_tree.py | 18 +-- 3 files changed, 158 insertions(+), 174 deletions(-) (limited to 'bs4') diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index c5e6e84..e087f07 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -7,7 +7,6 @@ import sys from bs4.element import ( CharsetMetaAttributeValue, ContentMetaAttributeValue, - HTMLAwareEntitySubstitution, nonwhitespace_re ) @@ -90,7 +89,6 @@ class TreeBuilder(object): is_xml = False picklable = False - preserve_whitespace_tags = set() empty_element_tags = None # A tag will be considered an empty-element # tag when and only when it has no contents. @@ -98,9 +96,11 @@ class TreeBuilder(object): # comma-separated list of CDATA, rather than a single CDATA. DEFAULT_CDATA_LIST_ATTRIBUTES = {} + DEFAULT_PRESERVE_WHITESPACE_TAGS = set() + USE_DEFAULT = object() - def __init__(self, multi_valued_attributes=USE_DEFAULT): + def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT): """Constructor. :param multi_valued_attributes: If this is set to None, the @@ -110,14 +110,19 @@ class TreeBuilder(object): for an example. Internally, these are called "CDATA list attributes", but that - probably doesn't make sense to an end-use, so the argument ame + probably doesn't make sense to an end-user, so the argument name is `multi_valued_attributes`. + + :param preserve_whitespace_tags: """ self.soup = None if multi_valued_attributes is self.USE_DEFAULT: multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES self.cdata_list_attributes = multi_valued_attributes - + if preserve_whitespace_tags is self.USE_DEFAULT: + preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS + self.preserve_whitespace_tags = preserve_whitespace_tags + def initialize_soup(self, soup): """The BeautifulSoup object has been initialized and is now being associated with the TreeBuilder. @@ -253,7 +258,6 @@ class HTMLTreeBuilder(TreeBuilder): Such as which tags are empty-element tags. """ - preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags empty_element_tags = set([ # These are from HTML5. 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', @@ -292,6 +296,8 @@ class HTMLTreeBuilder(TreeBuilder): "output" : ["for"], } + DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) + def set_up_substitutions(self, tag): # We are only interested in tags if tag.name != 'meta': diff --git a/bs4/element.py b/bs4/element.py index e8e48df..a233dcd 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -99,134 +99,114 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): return match.group(1) + encoding return self.CHARSET_RE.sub(rewrite, self.original_value) -class HTMLAwareEntitySubstitution(EntitySubstitution): - """Entity substitution rules that are aware of some HTML quirks. +class Formatter(EntitySubstitution): + """Describes a strategy to use when outputting a parse tree to a string. - Specifically, the contents of