summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--bs4/element.py221
-rw-r--r--bs4/formatter.py99
-rw-r--r--bs4/tests/test_tree.py4
-rw-r--r--doc/source/index.rst58
4 files changed, 198 insertions, 184 deletions
diff --git a/bs4/element.py b/bs4/element.py
index a233dcd..c4b5bc7 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -16,7 +16,11 @@ except ImportError, e:
'The soupsieve package is not installed. CSS selectors cannot be used.'
)
-from bs4.dammit import EntitySubstitution
+from bs4.formatter import (
+ Formatter,
+ HTMLFormatter,
+ XMLFormatter,
+)
DEFAULT_OUTPUT_ENCODING = "utf-8"
PY3K = (sys.version_info[0] > 2)
@@ -99,123 +103,71 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value)
+
+class PageElement(object):
+ """Contains the navigational information for some part of the page
+ (either a tag or a piece of text)"""
+
+ def setup(self, parent=None, previous_element=None, next_element=None,
+ previous_sibling=None, next_sibling=None):
+ """Sets up the initial relations between this element and
+ other elements."""
+ self.parent = parent
-class Formatter(EntitySubstitution):
- """Describes a strategy to use when outputting a parse tree to a string.
+ self.previous_element = previous_element
+ if previous_element is not None:
+ self.previous_element.next_element = self
- Some parts of this strategy come from the distinction between
- HTML4, HTML5, and XML. Others are configurable by the user.
- """
- # Registries of XML and HTML formatters.
- XML_FORMATTERS = {}
- HTML_FORMATTERS = {}
+ self.next_element = next_element
+ if self.next_element is not None:
+ self.next_element.previous_element = self
- HTML = 'html'
- XML = 'xml'
+ self.next_sibling = next_sibling
+ if self.next_sibling is not None:
+ self.next_sibling.previous_sibling = self
- HTML_DEFAULTS = dict(
- cdata_containing_tags=set(["script", "style"]),
- preformatted_tags=set(["pre"]),
- )
+ if (previous_sibling is None
+ and self.parent is not None and self.parent.contents):
+ previous_sibling = self.parent.contents[-1]
- def _default(self, language, value, kwarg):
- if value is not None:
- return value
- if language == self.XML:
- return set()
- return self.HTML_DEFAULTS[kwarg]
-
- def __init__(
- self, language=None, entity_substitution=None,
- void_element_close_prefix='/', cdata_containing_tags=None,
- preformatted_tags=None,
- ):
- """
+ self.previous_sibling = previous_sibling
+ if previous_sibling is not None:
+ self.previous_sibling.next_sibling = self
- :param void_element_close_prefix: By default, represent void
- elements as <tag/> rather than <tag>
- """
- self.language = language
- self.entity_substitution = entity_substitution
- self.void_element_close_prefix = void_element_close_prefix
- self.cdata_containing_tags = self._default(
- language, cdata_containing_tags, 'cdata_containing_tags'
- )
-
- def substitute(self, ns):
- """Process a string that needs to undergo entity substitution."""
- if not self.entity_substitution:
- return ns
- if (isinstance(ns, NavigableString)
- and ns.parent is not None
- and ns.parent.name in self.cdata_containing_tags):
- # Do nothing.
- return ns
- # Substitute.
- return self.entity_substitution(ns)
-
- def attribute_value(self, value):
- """Process the value of an attribute."""
- return self.substitute(value)
-
- def attributes(self, tag):
- """Reorder a tag's attributes however you want."""
- return sorted(tag.attrs.items())
-
-class HTMLFormatter(Formatter):
- REGISTRY = {}
- def __init__(self, *args, **kwargs):
- return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
-
-class XMLFormatter(Formatter):
- REGISTRY = {}
- def __init__(self, *args, **kwargs):
- return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
-
-# Set up aliases for the default formatters.
-HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
- entity_substitution=EntitySubstitution.substitute_html
-)
-HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
- entity_substitution=EntitySubstitution.substitute_html,
- void_element_close_prefix = None
-)
-HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
- entity_substitution=EntitySubstitution.substitute_xml
-)
-HTMLFormatter.REGISTRY[None] = HTMLFormatter(
- entity_substitution=None
-)
-XMLFormatter.REGISTRY["html"] = XMLFormatter(
- entity_substitution=EntitySubstitution.substitute_html
-)
-XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
- entity_substitution=EntitySubstitution.substitute_xml
-)
-XMLFormatter.REGISTRY[None] = Formatter(
- Formatter(Formatter.XML, entity_substitution=None)
-)
-
-class PageElement(object):
- """Contains the navigational information for some part of the page
- (either a tag or a piece of text)"""
-
def format_string(self, s, formatter):
"""Format the given string using the given formatter."""
if formatter is None:
return s
if not isinstance(formatter, Formatter):
- formatter = self.formatter_by_name(formatter)
+ formatter = self.formatter_for_name(formatter)
output = formatter.substitute(s)
return output
+ def formatter_for_name(self, formatter):
+ """Look up or create a Formatter for the given identifier,
+ if necessary.
+
+ :param formatter: Can be a Formatter object (used as-is), a
+ function (used as the entity substitution hook for an
+ XMLFormatter or HTMLFormatter), or a string (used to look up
+ an XMLFormatter or HTMLFormatter in the appropriate registry).
+ """
+ if isinstance(formatter, Formatter):
+ return formatter
+ if self._is_xml:
+ c = XMLFormatter
+ else:
+ c = HTMLFormatter
+ if callable(formatter):
+ return c(entity_substitution=formatter)
+ return c.REGISTRY[formatter]
+
@property
def _is_xml(self):
"""Is this element part of an XML tree or an HTML tree?
- This is used when mapping a formatter name ("minimal") to an
- appropriate function (one that performs entity-substitution on
- the contents of <script> and <style> tags, or not). It can be
+ This is used in formatter_for_name, when deciding whether an
+ XMLFormatter or HTMLFormatter is more appropriate. It can be
inefficient, but it should be called very rarely.
"""
if self.known_xml is not None:
@@ -233,32 +185,6 @@ class PageElement(object):
return getattr(self, 'is_xml', False)
return self.parent._is_xml
- def setup(self, parent=None, previous_element=None, next_element=None,
- previous_sibling=None, next_sibling=None):
- """Sets up the initial relations between this element and
- other elements."""
- self.parent = parent
-
- self.previous_element = previous_element
- if previous_element is not None:
- self.previous_element.next_element = self
-
- self.next_element = next_element
- if self.next_element is not None:
- self.next_element.previous_element = self
-
- self.next_sibling = next_sibling
- if self.next_sibling is not None:
- self.next_sibling.previous_sibling = self
-
- if (previous_sibling is None
- and self.parent is not None and self.parent.contents):
- previous_sibling = self.parent.contents[-1]
-
- self.previous_sibling = previous_sibling
- if previous_sibling is not None:
- self.previous_sibling.next_sibling = self
-
nextSibling = _alias("next_sibling") # BS3
previousSibling = _alias("previous_sibling") # BS3
@@ -720,6 +646,7 @@ class NavigableString(unicode, PageElement):
self.__class__.__name__, attr))
def output_ready(self, formatter="minimal"):
+ """Run the string through the provided formatter."""
output = self.format_string(self, formatter)
return self.PREFIX + output + self.SUFFIX
@@ -742,8 +669,8 @@ class PreformattedString(NavigableString):
"""CData strings are passed into the formatter, purely
for any side effects. The return value is ignored.
"""
- if formatter:
- self.format_string(self, formatter)
+ if formatter is not None:
+ ignore = self.format_string(self, formatter)
return self.PREFIX + self + self.SUFFIX
class CData(PreformattedString):
@@ -1119,8 +1046,9 @@ class Tag(PageElement):
encoding.
"""
- # First off, turn a string formatter into a Formatter object. This
- # will stop the lookup from happening over and over again.
+ # First off, turn a non-Formatter `formatter` into a Formatter
+ # object. This will stop the lookup from happening over and
+ # over again.
if not isinstance(formatter, Formatter):
formatter = self.formatter_for_name(formatter)
attributes = formatter.attributes(self)
@@ -1142,7 +1070,7 @@ class Tag(PageElement):
text = formatter.attribute_value(val)
decoded = (
unicode(key) + '='
- + EntitySubstitution.quoted_attribute_value(text))
+ + formatter.quoted_attribute_value(text))
attrs.append(decoded)
close = ''
closeTag = ''
@@ -1167,7 +1095,8 @@ class Tag(PageElement):
else:
indent_contents = None
contents = self.decode_contents(
- indent_contents, eventual_encoding, formatter)
+ indent_contents, eventual_encoding, formatter
+ )
if self.hidden:
# This is the 'document root' object.
@@ -1221,14 +1150,14 @@ class Tag(PageElement):
indented this many spaces.
:param eventual_encoding: The tag is destined to be
- encoded into this encoding. This method is _not_
+ encoded into this encoding. decode_contents() is _not_
responsible for performing that encoding. This information
is passed in so that it can be substituted in if the
document contains a <META> tag that mentions the document's
encoding.
- :param formatter: The output formatter responsible for converting
- entities to Unicode characters.
+ :param formatter: A Formatter object, or a string naming one of
+ the standard Formatters.
"""
# First off, turn a string formatter into a Formatter object. This
# will stop the lookup from happening over and over again.
@@ -1244,29 +1173,17 @@ class Tag(PageElement):
elif isinstance(c, Tag):
s.append(c.decode(indent_level, eventual_encoding,
formatter))
- if text and indent_level and not self.name == 'pre':
+ preserve_whitespace = self.name in self.preserve_whitespace_tags
+ if text and indent_level and not preserve_whitespace:
text = text.strip()
if text:
- if pretty_print and not self.name == 'pre':
+ if pretty_print and not preserve_whitespace:
s.append(" " * (indent_level - 1))
s.append(text)
- if pretty_print and not self.name == 'pre':
+ if pretty_print and not preserve_whitespace:
s.append("\n")
return ''.join(s)
-
- def formatter_for_name(self, formatter):
- if isinstance(formatter, Formatter):
- return formatter
- if self._is_xml:
- c = XMLFormatter
- else:
- c = HTMLFormatter
- if callable(formatter):
- formatter = c(entity_substitution=formatter)
- formatter.custom = True
- return formatter
- return c.REGISTRY[formatter]
-
+
def encode_contents(
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
diff --git a/bs4/formatter.py b/bs4/formatter.py
new file mode 100644
index 0000000..f2724db
--- /dev/null
+++ b/bs4/formatter.py
@@ -0,0 +1,99 @@
+from bs4.dammit import EntitySubstitution
+
+class Formatter(EntitySubstitution):
+ """Describes a strategy to use when outputting a parse tree to a string.
+
+ Some parts of this strategy come from the distinction between
+ HTML4, HTML5, and XML. Others are configurable by the user.
+ """
+ # Registries of XML and HTML formatters.
+ XML_FORMATTERS = {}
+ HTML_FORMATTERS = {}
+
+ HTML = 'html'
+ XML = 'xml'
+
+ HTML_DEFAULTS = dict(
+ cdata_containing_tags=set(["script", "style"]),
+ )
+
+ def _default(self, language, value, kwarg):
+ if value is not None:
+ return value
+ if language == self.XML:
+ return set()
+ return self.HTML_DEFAULTS[kwarg]
+
+ def __init__(
+ self, language=None, entity_substitution=None,
+ void_element_close_prefix='/', cdata_containing_tags=None,
+ ):
+ """
+
+ :param void_element_close_prefix: By default, represent void
+ elements as <tag/> rather than <tag>
+ """
+ self.language = language
+ self.entity_substitution = entity_substitution
+ self.void_element_close_prefix = void_element_close_prefix
+ self.cdata_containing_tags = self._default(
+ language, cdata_containing_tags, 'cdata_containing_tags'
+ )
+
+ def substitute(self, ns):
+ """Process a string that needs to undergo entity substitution."""
+ if not self.entity_substitution:
+ return ns
+ from element import NavigableString
+ if (isinstance(ns, NavigableString)
+ and ns.parent is not None
+ and ns.parent.name in self.cdata_containing_tags):
+ # Do nothing.
+ return ns
+ # Substitute.
+ return self.entity_substitution(ns)
+
+ def attribute_value(self, value):
+ """Process the value of an attribute."""
+ return self.substitute(value)
+
+ def attributes(self, tag):
+ """Reorder a tag's attributes however you want."""
+ return sorted(tag.attrs.items())
+
+
+class HTMLFormatter(Formatter):
+ REGISTRY = {}
+ def __init__(self, *args, **kwargs):
+ return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
+
+
+class XMLFormatter(Formatter):
+ REGISTRY = {}
+ def __init__(self, *args, **kwargs):
+ return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
+
+
+# Set up aliases for the default formatters.
+HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
+ entity_substitution=EntitySubstitution.substitute_html
+)
+HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
+ entity_substitution=EntitySubstitution.substitute_html,
+ void_element_close_prefix = None
+)
+HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
+ entity_substitution=EntitySubstitution.substitute_xml
+)
+HTMLFormatter.REGISTRY[None] = HTMLFormatter(
+ entity_substitution=None
+)
+XMLFormatter.REGISTRY["html"] = XMLFormatter(
+ entity_substitution=EntitySubstitution.substitute_html
+)
+XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
+ entity_substitution=EntitySubstitution.substitute_xml
+)
+XMLFormatter.REGISTRY[None] = Formatter(
+ Formatter(Formatter.XML, entity_substitution=None)
+)
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index c1cd581..6510f85 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1571,11 +1571,11 @@ class TestSubstitutions(SoupTest):
self.assertTrue(b"< < hey > >" in encoded)
def test_prettify_leaves_preformatted_text_alone(self):
- soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ")
+ soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz <textarea> eee\nfff\t</textarea></div>")
# Everything outside the <pre> tag is reformatted, but everything
# inside is left alone.
self.assertEqual(
- u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
+ u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>',
soup.div.prettify())
def test_prettify_accepts_formatter_function(self):
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 8376549..0c09964 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2264,16 +2264,17 @@ to Beautiful Soup generating invalid HTML/XML, as in these examples::
print(link_soup.a.encode(formatter=None))
# <a href="http://example.com/?foo=val1&bar=val2">A link</a>
-Finally, if you pass in a function for ``formatter``, Beautiful Soup
-will call that function once for every string and attribute value in
-the document. You can do whatever you want in this function. Here's a
-formatter that converts strings to uppercase and does absolutely
-nothing else::
+If you need more sophisticated control over your output, you can
+use Beautiful Soup's ``Formatter`` class. Here's a formatter that
+converts strings to uppercase, whether they occur in a text node or in an
+attribute value::
+ from bs4.formatter import HTMLFormatter
def uppercase(str):
return str.upper()
+ formatter = HTMLFormatter(uppercase)
- print(soup.prettify(formatter=uppercase))
+ print(soup.prettify(formatter=formatter))
# <html>
# <body>
# <p>
@@ -2282,34 +2283,31 @@ nothing else::
# </body>
# </html>
- print(link_soup.a.prettify(formatter=uppercase))
+ print(link_soup.a.prettify(formatter=formatter))
# <a href="HTTP://EXAMPLE.COM/?FOO=VAL1&BAR=VAL2">
# A LINK
# </a>
-If you're writing your own function, you should know about the
-``EntitySubstitution`` class in the ``bs4.dammit`` module. This class
-implements Beautiful Soup's standard formatters as class methods: the
-"html" formatter is ``EntitySubstitution.substitute_html``, and the
-"minimal" formatter is ``EntitySubstitution.substitute_xml``. You can
-use these functions to simulate ``formatter=html`` or
-``formatter==minimal``, but then do something extra.
-
-Here's an example that replaces Unicode characters with HTML entities
-whenever possible, but `also` converts all strings to uppercase::
-
- from bs4.dammit import EntitySubstitution
- def uppercase_and_substitute_html_entities(str):
- return EntitySubstitution.substitute_html(str.upper())
-
- print(soup.prettify(formatter=uppercase_and_substitute_html_entities))
- # <html>
- # <body>
- # <p>
- # IL A DIT &lt;&lt;SACR&Eacute; BLEU!&gt;&gt;
- # </p>
- # </body>
- # </html>
+Subclassing ``HTMLFormatter`` or ``XMLFormatter`` will give you even
+more control over the output. For example, Beautiful Soup sorts the
+attributes in every tag by default::
+
+ attr_soup = BeautifulSoup('<p z="1" m="2" a="3"></p>')
+ print(attr_soup.p.encode())
+ # <p a="3" m="2" z="1"></p>
+
+To turn this off, you can override the ``Formatter.attributes()``
+method, which controls which attributes are output and in what
+order. This implementation also filters out one of the attributes::
+
+ class UnsortedAttributes(HTMLFormatter):
+ def attributes(self, tag):
+ for k, v in tag.attrs.items():
+ if k == 'm':
+ continue
+ yield k, v
+ print(attr_soup.p.encode(formatter=UnsortedAttributes()))
+ # <p z="1" a="3"></p>
One last caveat: if you create a ``CData`` object, the text inside
that object is always presented `exactly as it appears, with no