diff options
-rw-r--r-- | bs4/__init__.py | 6 | ||||
-rw-r--r-- | bs4/element.py | 49 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 56 |
3 files changed, 89 insertions, 22 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py index ee68e25..21e5d6c 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -297,9 +297,10 @@ class BeautifulSoup(Tag): def decode(self, pretty_print=False, eventual_encoding=DEFAULT_OUTPUT_ENCODING, - substitute_html_entities=False): + formatter="minimal"): """Returns a string or Unicode representation of this document. To get Unicode, pass None for encoding.""" + if self.is_xml: # Print the XML declaration encoding_part = '' @@ -313,8 +314,7 @@ class BeautifulSoup(Tag): else: indent_level = 0 return prefix + super(BeautifulSoup, self).decode( - indent_level, eventual_encoding, - substitute_html_entities) + indent_level, eventual_encoding, formatter) class StopParsing(Exception): diff --git a/bs4/element.py b/bs4/element.py index 7c72894..b176777 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -29,6 +29,24 @@ class PageElement(object): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" + # There are four possible values for the "formatter" argument passed in + # to methods like encode() and prettify(): + # + # "html" - All Unicode characters with corresponding HTML entities + # are converted to those entities on output. + # "minimal" - Bare ampersands and angle brackets are converted to + # XML entities: &amp; &lt; &gt; + # None - The null formatter. Unicode characters are never + # converted to entities. This is not recommended, but it's + # faster than "minimal". 
+ # A function - This function will be called on every string that + # needs to undergo entity substitution + FORMATTERS = { + "html" : EntitySubstitution.substitute_html, + "minimal" : EntitySubstitution.substitute_xml, + None : None + } + def setup(self, parent=None, previous_element=None): """Sets up the initial relations between this element and other elements.""" @@ -396,11 +414,15 @@ class NavigableString(unicode, PageElement): "'%s' object has no attribute '%s'" % ( self.__class__.__name__, attr)) - def output_ready(self, substitute_html_entities=False): - if substitute_html_entities: - output = EntitySubstitution.substitute_html(self) + def output_ready(self, formatter="minimal"): + if not callable(formatter): + formatter = self.FORMATTERS.get( + formatter, EntitySubstitution.substitute_xml) + if formatter is None: + output = self else: - output = EntitySubstitution.substitute_xml(self) + output = formatter(self) + return self.PREFIX + output + self.SUFFIX @@ -673,13 +695,13 @@ class Tag(PageElement): __str__ = __repr__ = __unicode__ def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, - indent_level=None, substitute_html_entities=False): + indent_level=None, formatter="minimal"): return self.decode(indent_level, encoding, - substitute_html_entities).encode(encoding) + formatter).encode(encoding) def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, - substitute_html_entities=False): + formatter="minimal"): """Returns a Unicode representation of this tag and its contents. :param eventual_encoding: The tag is destined to be @@ -720,7 +742,7 @@ class Tag(PageElement): space = '' indent_contents = None contents = self.decode_contents( - indent_contents, eventual_encoding, substitute_html_entities) + indent_contents, eventual_encoding, formatter) if self.hidden: # This is the 'document root' object. 
@@ -746,12 +768,13 @@ class Tag(PageElement): s = ''.join(s) return s - def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): - return self.encode(encoding, True) + def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + return self.encode(encoding, True, formatter) def decode_contents(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, - substitute_html_entities=False): + formatter="minimal"): """Renders the contents of this tag as a Unicode string. :param eventual_encoding: The tag is destined to be @@ -766,10 +789,10 @@ class Tag(PageElement): for c in self: text = None if isinstance(c, NavigableString): - text = c.output_ready(substitute_html_entities) + text = c.output_ready(formatter) elif isinstance(c, Tag): s.append(c.decode(indent_level, eventual_encoding, - substitute_html_entities)) + formatter)) if text and indent_level: text = text.strip() if text: diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 82a3bfa..5552347 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -934,12 +934,57 @@ class TestPersistence(SoupTest): class TestSubstitutions(SoupTest): - def test_html_entity_substitution(self): - soup = self.soup( - u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>") - decoded = soup.decode(substitute_html_entities=True) + def test_default_formatter_is_minimal(self): + markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>" + soup = self.soup(markup) + decoded = soup.decode(formatter="minimal") + # The < is converted back into &lt; but the e-with-acute is left alone. 
+ self.assertEqual( + decoded, + self.document_for( + u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>")) + + def test_formatter_html(self): + markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>" + soup = self.soup(markup) + decoded = soup.decode(formatter="html") + self.assertEqual( + decoded, + self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>")) + + def test_formatter_minimal(self): + markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>" + soup = self.soup(markup) + decoded = soup.decode(formatter="minimal") + # The < is converted back into &lt; but the e-with-acute is left alone. + self.assertEqual( + decoded, + self.document_for( + u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>")) + + def test_formatter_null(self): + markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>" + soup = self.soup(markup) + decoded = soup.decode(formatter=None) + # Neither the angle brackets nor the e-with-acute are converted. + # This is not valid HTML, but it's what the user wanted. self.assertEqual(decoded, - self.document_for("<b>Sacr&eacute; bleu!</b>")) + self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) + + def test_formatter_custom(self): + markup = u"<b>&lt;foo&gt;</b><b>bar</b>" + soup = self.soup(markup) + decoded = soup.decode(formatter = lambda x: x.upper()) + # Instead of normal entity conversion code, the custom + # callable is called on every string. 
+ self.assertEqual( + decoded, + self.document_for(u"<b><FOO></b><b>BAR</b>")) + + def test_prettify_accepts_formatter(self): + soup = BeautifulSoup("<html><body>foo</body></html>") + pretty = soup.prettify(formatter = lambda x: x.upper()) + self.assertTrue(b"FOO" in pretty) def test_html_entity_substitution_off_by_default(self): markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>" @@ -984,7 +1029,6 @@ class TestSubstitutions(SoupTest): soup = self.soup(markup, parse_only=strainer) self.assertEqual(soup.contents[0].name, 'pre') - class TestEncoding(SoupTest): """Test the ability to encode objects into strings.""" |