summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--bs4/__init__.py6
-rw-r--r--bs4/element.py49
-rw-r--r--bs4/tests/test_tree.py56
3 files changed, 89 insertions, 22 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index ee68e25..21e5d6c 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -297,9 +297,10 @@ class BeautifulSoup(Tag):
def decode(self, pretty_print=False,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
- substitute_html_entities=False):
+ formatter="minimal"):
"""Returns a string or Unicode representation of this document.
To get Unicode, pass None for encoding."""
+
if self.is_xml:
# Print the XML declaration
encoding_part = ''
@@ -313,8 +314,7 @@ class BeautifulSoup(Tag):
else:
indent_level = 0
return prefix + super(BeautifulSoup, self).decode(
- indent_level, eventual_encoding,
- substitute_html_entities)
+ indent_level, eventual_encoding, formatter)
class StopParsing(Exception):
diff --git a/bs4/element.py b/bs4/element.py
index 7c72894..b176777 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -29,6 +29,24 @@ class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
+ # There are four possible values for the "formatter" argument passed in
+ # to methods like encode() and prettify():
+ #
+ # "html" - All Unicode characters with corresponding HTML entities
+ # are converted to those entities on output.
+ # "minimal" - Bare ampersands and angle brackets are converted to
+ # XML entities: &amp; &lt; &gt;
+ # None - The null formatter. Unicode characters are never
+ # converted to entities. This is not recommended, but it's
+ # faster than "minimal".
+ # A function - This function will be called on every string that
+ # needs to undergo entity substitution.
+ FORMATTERS = {
+ "html" : EntitySubstitution.substitute_html,
+ "minimal" : EntitySubstitution.substitute_xml,
+ None : None
+ }
+
def setup(self, parent=None, previous_element=None):
"""Sets up the initial relations between this element and
other elements."""
@@ -396,11 +414,15 @@ class NavigableString(unicode, PageElement):
"'%s' object has no attribute '%s'" % (
self.__class__.__name__, attr))
- def output_ready(self, substitute_html_entities=False):
- if substitute_html_entities:
- output = EntitySubstitution.substitute_html(self)
+ def output_ready(self, formatter="minimal"):
+ if not callable(formatter):
+ formatter = self.FORMATTERS.get(
+ formatter, EntitySubstitution.substitute_xml)
+ if formatter is None:
+ output = self
else:
- output = EntitySubstitution.substitute_xml(self)
+ output = formatter(self)
+
return self.PREFIX + output + self.SUFFIX
@@ -673,13 +695,13 @@ class Tag(PageElement):
__str__ = __repr__ = __unicode__
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
- indent_level=None, substitute_html_entities=False):
+ indent_level=None, formatter="minimal"):
return self.decode(indent_level, encoding,
- substitute_html_entities).encode(encoding)
+ formatter).encode(encoding)
def decode(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
- substitute_html_entities=False):
+ formatter="minimal"):
"""Returns a Unicode representation of this tag and its contents.
:param eventual_encoding: The tag is destined to be
@@ -720,7 +742,7 @@ class Tag(PageElement):
space = ''
indent_contents = None
contents = self.decode_contents(
- indent_contents, eventual_encoding, substitute_html_entities)
+ indent_contents, eventual_encoding, formatter)
if self.hidden:
# This is the 'document root' object.
@@ -746,12 +768,13 @@ class Tag(PageElement):
s = ''.join(s)
return s
- def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
- return self.encode(encoding, True)
+ def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING,
+ formatter="minimal"):
+ return self.encode(encoding, True, formatter)
def decode_contents(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
- substitute_html_entities=False):
+ formatter="minimal"):
"""Renders the contents of this tag as a Unicode string.
:param eventual_encoding: The tag is destined to be
@@ -766,10 +789,10 @@ class Tag(PageElement):
for c in self:
text = None
if isinstance(c, NavigableString):
- text = c.output_ready(substitute_html_entities)
+ text = c.output_ready(formatter)
elif isinstance(c, Tag):
s.append(c.decode(indent_level, eventual_encoding,
- substitute_html_entities))
+ formatter))
if text and indent_level:
text = text.strip()
if text:
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 82a3bfa..5552347 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -934,12 +934,57 @@ class TestPersistence(SoupTest):
class TestSubstitutions(SoupTest):
- def test_html_entity_substitution(self):
- soup = self.soup(
- u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>")
- decoded = soup.decode(substitute_html_entities=True)
+ def test_default_formatter_is_minimal(self):
+ markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+ soup = self.soup(markup)
+ decoded = soup.decode(formatter="minimal")
+ # The < is converted back into &lt; but the e-with-acute is left alone.
+ self.assertEqual(
+ decoded,
+ self.document_for(
+ u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
+
+ def test_formatter_html(self):
+ markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+ soup = self.soup(markup)
+ decoded = soup.decode(formatter="html")
+ self.assertEqual(
+ decoded,
+ self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
+
+ def test_formatter_minimal(self):
+ markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+ soup = self.soup(markup)
+ decoded = soup.decode(formatter="minimal")
+ # The < is converted back into &lt; but the e-with-acute is left alone.
+ self.assertEqual(
+ decoded,
+ self.document_for(
+ u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
+
+ def test_formatter_null(self):
+ markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+ soup = self.soup(markup)
+ decoded = soup.decode(formatter=None)
+ # Neither the angle brackets nor the e-with-acute are converted.
+ # This is not valid HTML, but it's what the user wanted.
self.assertEqual(decoded,
- self.document_for("<b>Sacr&eacute; bleu!</b>"))
+ self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
+
+ def test_formatter_custom(self):
+ markup = u"<b>&lt;foo&gt;</b><b>bar</b>"
+ soup = self.soup(markup)
+ decoded = soup.decode(formatter = lambda x: x.upper())
+ # Instead of normal entity conversion code, the custom
+ # callable is called on every string.
+ self.assertEqual(
+ decoded,
+ self.document_for(u"<b><FOO></b><b>BAR</b>"))
+
+ def test_prettify_accepts_formatter(self):
+ soup = BeautifulSoup("<html><body>foo</body></html>")
+ pretty = soup.prettify(formatter = lambda x: x.upper())
+ self.assertTrue(b"FOO" in pretty)
def test_html_entity_substitution_off_by_default(self):
markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
@@ -984,7 +1029,6 @@ class TestSubstitutions(SoupTest):
soup = self.soup(markup, parse_only=strainer)
self.assertEqual(soup.contents[0].name, 'pre')
-
class TestEncoding(SoupTest):
"""Test the ability to encode objects into strings."""