summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2019-07-14 17:09:58 -0400
committer Leonard Richardson <leonardr@segfault.org> 2019-07-14 17:09:58 -0400
commit 0df054db08ef3286482694ee0c9aa85b5313dfd2 (patch)
tree d1b38991f1148abccb0862484d87d760654cd18f
parent 519afbe269b671e15a1f1d2aecfe4fc579b61efc (diff)
Give the Formatter class more control over formatting decisions.
-rw-r--r-- bs4/builder/__init__.py 18
-rw-r--r-- bs4/element.py 296
-rw-r--r-- bs4/tests/test_tree.py 18
3 files changed, 158 insertions, 174 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index c5e6e84..e087f07 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -7,7 +7,6 @@ import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
- HTMLAwareEntitySubstitution,
nonwhitespace_re
)
@@ -90,7 +89,6 @@ class TreeBuilder(object):
is_xml = False
picklable = False
- preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
@@ -98,9 +96,11 @@ class TreeBuilder(object):
# comma-separated list of CDATA, rather than a single CDATA.
DEFAULT_CDATA_LIST_ATTRIBUTES = {}
+ DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
+
USE_DEFAULT = object()
- def __init__(self, multi_valued_attributes=USE_DEFAULT):
+ def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT):
"""Constructor.
:param multi_valued_attributes: If this is set to None, the
@@ -110,14 +110,19 @@ class TreeBuilder(object):
for an example.
Internally, these are called "CDATA list attributes", but that
- probably doesn't make sense to an end-use, so the argument ame
+ probably doesn't make sense to an end-user, so the argument name
is `multi_valued_attributes`.
+
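+ :param preserve_whitespace_tags: Names of tags that are exempt from pretty-printing. If omitted, DEFAULT_PRESERVE_WHITESPACE_TAGS is used.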
"""
self.soup = None
if multi_valued_attributes is self.USE_DEFAULT:
multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
self.cdata_list_attributes = multi_valued_attributes
-
+ if preserve_whitespace_tags is self.USE_DEFAULT:
+ preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
+ self.preserve_whitespace_tags = preserve_whitespace_tags
+
def initialize_soup(self, soup):
"""The BeautifulSoup object has been initialized and is now
being associated with the TreeBuilder.
@@ -253,7 +258,6 @@ class HTMLTreeBuilder(TreeBuilder):
Such as which tags are empty-element tags.
"""
- preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
empty_element_tags = set([
# These are from HTML5.
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
@@ -292,6 +296,8 @@ class HTMLTreeBuilder(TreeBuilder):
"output" : ["for"],
}
+ DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
+
def set_up_substitutions(self, tag):
# We are only interested in <meta> tags
if tag.name != 'meta':
diff --git a/bs4/element.py b/bs4/element.py
index e8e48df..a233dcd 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -99,134 +99,114 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value)
-class HTMLAwareEntitySubstitution(EntitySubstitution):
- """Entity substitution rules that are aware of some HTML quirks.
+class Formatter(EntitySubstitution):
+ """Describes a strategy to use when outputting a parse tree to a string.
- Specifically, the contents of <script> and <style> tags should not
- undergo entity substitution.
-
- Incoming NavigableString objects are checked to see if they're the
- direct children of a <script> or <style> tag.
+ Some parts of this strategy come from the distinction between
+ HTML4, HTML5, and XML. Others are configurable by the user.
"""
+ # Registries of XML and HTML formatters.
+ XML_FORMATTERS = {}
+ HTML_FORMATTERS = {}
- cdata_containing_tags = set(["script", "style"])
+ HTML = 'html'
+ XML = 'xml'
- preformatted_tags = set(["pre"])
+ HTML_DEFAULTS = dict(
+ cdata_containing_tags=set(["script", "style"]),
+ preformatted_tags=set(["pre"]),
+ )
- preserve_whitespace_tags = set(['pre', 'textarea'])
+ def _default(self, language, value, kwarg):
+ if value is not None:
+ return value
+ if language == self.XML:
+ return set()
+ return self.HTML_DEFAULTS[kwarg]
+
+ def __init__(
+ self, language=None, entity_substitution=None,
+ void_element_close_prefix='/', cdata_containing_tags=None,
+ preformatted_tags=None,
+ ):
+ """
- @classmethod
- def _substitute_if_appropriate(cls, ns, f):
+ :param void_element_close_prefix: By default, represent void
+ elements as <tag/> rather than <tag>
+ """
+ self.language = language
+ self.entity_substitution = entity_substitution
+ self.void_element_close_prefix = void_element_close_prefix
+ self.cdata_containing_tags = self._default(
+ language, cdata_containing_tags, 'cdata_containing_tags'
+ )
+
+ def substitute(self, ns):
+ """Process a string that needs to undergo entity substitution."""
+ if not self.entity_substitution:
+ return ns
if (isinstance(ns, NavigableString)
and ns.parent is not None
- and ns.parent.name in cls.cdata_containing_tags):
+ and ns.parent.name in self.cdata_containing_tags):
# Do nothing.
return ns
# Substitute.
- return f(ns)
-
- @classmethod
- def substitute_html(cls, ns):
- return cls._substitute_if_appropriate(
- ns, EntitySubstitution.substitute_html)
+ return self.entity_substitution(ns)
- @classmethod
- def substitute_xml(cls, ns):
- return cls._substitute_if_appropriate(
- ns, EntitySubstitution.substitute_xml)
-
-class Formatter(object):
- """Contains information about how to format a parse tree."""
+ def attribute_value(self, value):
+ """Process the value of an attribute."""
+ return self.substitute(value)
- # By default, represent void elements as <tag/> rather than <tag>
- void_element_close_prefix = '/'
-
- def substitute(self, *args, **kwargs):
- """Transform certain characters into named entities."""
- raise NotImplementedError()
-
- def sort_attributes(self, attributes):
+ def attributes(self, tag):
"""Reorder a tag's attributes however you want."""
- return sorted(attributes.items())
-
-
+ return sorted(tag.attrs.items())
+
class HTMLFormatter(Formatter):
- """The default HTML formatter."""
- def substitute(self, *args, **kwargs):
- return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
-
-class MinimalHTMLFormatter(Formatter):
- """A minimal HTML formatter."""
- def substitute(self, *args, **kwargs):
- return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs)
+ REGISTRY = {}
+ def __init__(self, *args, **kwargs):
+ return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
-class HTML5Formatter(HTMLFormatter):
- """An HTML formatter that omits the slash in a void tag."""
- void_element_close_prefix = None
-
class XMLFormatter(Formatter):
- """Substitute only the essential XML entities."""
- def substitute(self, *args, **kwargs):
- return EntitySubstitution.substitute_xml(*args, **kwargs)
-
-class HTMLXMLFormatter(Formatter):
- """Format XML using HTML rules."""
- def substitute(self, *args, **kwargs):
- return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
-
+ REGISTRY = {}
+ def __init__(self, *args, **kwargs):
+ return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
+
+# Register the default formatters under the names ("html", "html5", "minimal", None) looked up via REGISTRY.
+HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
+ entity_substitution=EntitySubstitution.substitute_html
+)
+HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
+ entity_substitution=EntitySubstitution.substitute_html,
+ void_element_close_prefix = None
+)
+HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
+ entity_substitution=EntitySubstitution.substitute_xml
+)
+HTMLFormatter.REGISTRY[None] = HTMLFormatter(
+ entity_substitution=None
+)
+XMLFormatter.REGISTRY["html"] = XMLFormatter(
+ entity_substitution=EntitySubstitution.substitute_html
+)
+XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
+ entity_substitution=EntitySubstitution.substitute_xml
+)
+XMLFormatter.REGISTRY[None] = XMLFormatter(
+ entity_substitution=None
+)
class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
-
- # There are five possible values for the "formatter" argument passed in
- # to methods like encode() and prettify():
- #
- # "html" - All Unicode characters with corresponding HTML entities
- # are converted to those entities on output.
- # "html5" - The same as "html", but empty void tags are represented as
- # <tag> rather than <tag/>
- # "minimal" - Bare ampersands and angle brackets are converted to
- # XML entities: &amp; &lt; &gt;
- # None - The null formatter. Unicode characters are never
- # converted to entities. This is not recommended, but it's
- # faster than "minimal".
- # A callable function - it will be called on every string that needs to undergo entity substitution.
- # A Formatter instance - Formatter.substitute(string) will be called on every string that
- # needs to undergo entity substitution.
- #
-
- # In an HTML document, the default "html", "html5", and "minimal"
- # functions will leave the contents of <script> and <style> tags
- # alone. For an XML document, all tags will be given the same
- # treatment.
-
- HTML_FORMATTERS = {
- "html" : HTMLFormatter(),
- "html5" : HTML5Formatter(),
- "minimal" : MinimalHTMLFormatter(),
- None : None
- }
-
- XML_FORMATTERS = {
- "html" : HTMLXMLFormatter(),
- "minimal" : XMLFormatter(),
- None : None
- }
-
- def format_string(self, s, formatter='minimal'):
+
+ def format_string(self, s, formatter):
"""Format the given string using the given formatter."""
- if isinstance(formatter, basestring):
- formatter = self._formatter_for_name(formatter)
if formatter is None:
- output = s
- else:
- if isinstance(formatter, Callable):
- # Backwards compatibility -- you used to pass in a formatting method.
- output = formatter(s)
- else:
- output = formatter.substitute(s)
+ return s
+ if not isinstance(formatter, Formatter):
+ formatter = self.formatter_by_name(formatter)
+ output = formatter.substitute(s)
return output
@property
@@ -253,13 +233,6 @@ class PageElement(object):
return getattr(self, 'is_xml', False)
return self.parent._is_xml
- def _formatter_for_name(self, name):
- "Look up a formatter function based on its name and the tree."
- if self._is_xml:
- return self.XML_FORMATTERS.get(name, XMLFormatter())
- else:
- return self.HTML_FORMATTERS.get(name, HTMLFormatter())
-
def setup(self, parent=None, previous_element=None, next_element=None,
previous_sibling=None, next_sibling=None):
"""Sets up the initial relations between this element and
@@ -765,10 +738,12 @@ class PreformattedString(NavigableString):
but the return value will be ignored.
"""
- def output_ready(self, formatter="minimal"):
- """CData strings are passed into the formatter.
- But the return value is ignored."""
- self.format_string(self, formatter)
+ def output_ready(self, formatter=None):
+ """CData strings are passed into the formatter, purely
+ for any side effects. The return value is ignored.
+ """
+ if formatter:
+ self.format_string(self, formatter)
return self.PREFIX + self + self.SUFFIX
class CData(PreformattedString):
@@ -836,14 +811,6 @@ class Tag(PageElement):
self.name = name
self.namespace = namespace
self.prefix = prefix
- if builder is not None:
- preserve_whitespace_tags = builder.preserve_whitespace_tags
- else:
- if is_xml:
- preserve_whitespace_tags = []
- else:
- preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
- self.preserve_whitespace_tags = preserve_whitespace_tags
if attrs is None:
attrs = {}
elif attrs:
@@ -887,6 +854,10 @@ class Tag(PageElement):
# (unlike can_be_empty_element), we almost never need
# to check this.
self.cdata_list_attributes = builder.cdata_list_attributes
+
+ # Keep track of the names that might cause this tag to be treated as a
+ # whitespace-preserved tag.
+ self.preserve_whitespace_tags = builder.preserve_whitespace_tags
parserClass = _alias("parser_class") # BS3
@@ -1135,14 +1106,6 @@ class Tag(PageElement):
u = self.decode(indent_level, encoding, formatter)
return u.encode(encoding, errors)
- def _should_pretty_print(self, indent_level):
- """Should this tag be pretty-printed?"""
-
- return (
- indent_level is not None
- and self.name not in self.preserve_whitespace_tags
- )
-
def decode(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
@@ -1158,32 +1121,29 @@ class Tag(PageElement):
# First off, turn a string formatter into a Formatter object. This
# will stop the lookup from happening over and over again.
- if not isinstance(formatter, Formatter) and not isinstance(formatter, Callable):
- formatter = self._formatter_for_name(formatter)
+ if not isinstance(formatter, Formatter):
+ formatter = self.formatter_for_name(formatter)
+ attributes = formatter.attributes(self)
attrs = []
- if self.attrs:
- if isinstance(formatter, Formatter):
- sorted_attrs = formatter.sort_attributes(self.attrs)
+ for key, val in attributes:
+ if val is None:
+ decoded = key
else:
- sorted_attrs = self.attrs.items()
- for key, val in sorted_attrs:
- if val is None:
- decoded = key
- else:
- if isinstance(val, list) or isinstance(val, tuple):
- val = ' '.join(val)
- elif not isinstance(val, basestring):
- val = unicode(val)
- elif (
+ if isinstance(val, list) or isinstance(val, tuple):
+ val = ' '.join(val)
+ elif not isinstance(val, basestring):
+ val = unicode(val)
+ elif (
isinstance(val, AttributeValueWithCharsetSubstitution)
- and eventual_encoding is not None):
- val = val.encode(eventual_encoding)
-
- text = self.format_string(val, formatter)
- decoded = (
- unicode(key) + '='
- + EntitySubstitution.quoted_attribute_value(text))
- attrs.append(decoded)
+ and eventual_encoding is not None
+ ):
+ val = val.encode(eventual_encoding)
+
+ text = formatter.attribute_value(val)
+ decoded = (
+ unicode(key) + '='
+ + EntitySubstitution.quoted_attribute_value(text))
+ attrs.append(decoded)
close = ''
closeTag = ''
@@ -1192,9 +1152,7 @@ class Tag(PageElement):
prefix = self.prefix + ":"
if self.is_empty_element:
- close = ''
- if isinstance(formatter, Formatter):
- close = formatter.void_element_close_prefix or close
+ close = formatter.void_element_close_prefix or ''
else:
closeTag = '</%s%s>' % (prefix, self.name)
@@ -1241,6 +1199,13 @@ class Tag(PageElement):
s = ''.join(s)
return s
+ def _should_pretty_print(self, indent_level):
+ """Should this tag be pretty-printed?"""
+ return (
+ indent_level is not None
+ and self.name not in self.preserve_whitespace_tags
+ )
+
def prettify(self, encoding=None, formatter="minimal"):
if encoding is None:
return self.decode(True, formatter=formatter)
@@ -1267,8 +1232,8 @@ class Tag(PageElement):
"""
# First off, turn a string formatter into a Formatter object. This
# will stop the lookup from happening over and over again.
- if not isinstance(formatter, Formatter) and not isinstance(formatter, Callable):
- formatter = self._formatter_for_name(formatter)
+ if not isinstance(formatter, Formatter):
+ formatter = self.formatter_for_name(formatter)
pretty_print = (indent_level is not None)
s = []
@@ -1289,6 +1254,19 @@ class Tag(PageElement):
s.append("\n")
return ''.join(s)
+ def formatter_for_name(self, formatter):
+ if isinstance(formatter, Formatter):
+ return formatter
+ if self._is_xml:
+ c = XMLFormatter
+ else:
+ c = HTMLFormatter
+ if callable(formatter):
+ formatter = c(entity_substitution=formatter)
+ formatter.custom = True
+ return formatter
+ return c.REGISTRY[formatter]
+
def encode_contents(
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index f7c5e2f..ffbc29e 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -24,8 +24,8 @@ from bs4.element import (
CData,
Comment,
Declaration,
- MinimalHTMLFormatter,
Doctype,
+ HTMLFormatter,
NavigableString,
SoupStrainer,
Tag,
@@ -1533,7 +1533,7 @@ class TestSubstitutions(SoupTest):
# callable is called on every string.
self.assertEqual(
decoded,
- self.document_for(u"<b><FOO></b><b>BAR</b><br>"))
+ self.document_for(u"<b><FOO></b><b>BAR</b><br/>"))
def test_formatter_is_run_on_attribute_values(self):
markup = u'<a href="http://a.com?a=b&c=é">e</a>'
@@ -1687,10 +1687,10 @@ class TestEncoding(SoupTest):
class TestFormatter(SoupTest):
def test_sort_attributes(self):
- class UnsortedFormatter(MinimalHTMLFormatter):
- def sort_attributes(self, attributes):
- self.called_with = attributes
- for k, v in sorted(attributes.items()):
+ class UnsortedFormatter(HTMLFormatter):
+ def attributes(self, tag):
+ self.called_with = tag
+ for k, v in sorted(tag.attrs.items()):
if k == 'ignore':
continue
yield k,v
@@ -1699,9 +1699,9 @@ class TestFormatter(SoupTest):
formatter = UnsortedFormatter()
decoded = soup.decode(formatter=formatter)
- # sort_attributes() was called with all three attributes. It removed one and
- # sorted the other two.
- self.assertEquals(formatter.called_with, dict(cval="1", aval="2", ignore="ignored"))
+ # attributes() was called on the <p> tag. It filtered out one
+ # attribute and sorted the other two.
+ self.assertEquals(formatter.called_with, soup.p)
self.assertEquals(u'<p aval="2" cval="1"></p>', decoded)