diff options
-rw-r--r-- | beautifulsoup/__init__.py | 6 | ||||
-rw-r--r-- | beautifulsoup/dammit.py | 4 | ||||
-rw-r--r-- | beautifulsoup/element.py | 27 | ||||
-rw-r--r-- | tests/test_tree.py | 13 |
4 files changed, 38 insertions, 12 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py index cee55e7..f4c2a95 100644 --- a/beautifulsoup/__init__.py +++ b/beautifulsoup/__init__.py @@ -263,7 +263,8 @@ class BeautifulSoup(Tag): self.currentData.append(data) def decode(self, pretty_print=False, indent_level=0, - eventual_encoding=DEFAULT_OUTPUT_ENCODING): + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + replace_with_html_entities=False): """Returns a string or Unicode representation of this document. To get Unicode, pass None for encoding.""" if self.is_xml: @@ -275,7 +276,8 @@ class BeautifulSoup(Tag): else: prefix = u'' return prefix + super(BeautifulSoup, self).decode( - pretty_print, indent_level, eventual_encoding) + pretty_print, indent_level, eventual_encoding, + replace_with_html_entities) class StopParsing(Exception): diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py index 9833bd4..31dfa95 100644 --- a/beautifulsoup/dammit.py +++ b/beautifulsoup/dammit.py @@ -37,8 +37,8 @@ class EntitySubstitution(object): for codepoint, name in codepoint2name.items(): if codepoint == 34: # There's no point in turning the quotation mark into - # &quot;, unless it happens in an attribute value, which - # is done elsewhere. + # &quot;, unless it happens within an attribute value, which + # is handled elsewhere. 
continue; character = unichr(codepoint) characters.append(character) diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index 23f8c33..f3a59d4 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -561,11 +561,14 @@ class Tag(PageElement, EntitySubstitution): return self.encode() def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, - pretty_print=False, indent_level=0): - return self.decode(pretty_print, indent_level, encoding).encode(encoding) + pretty_print=False, indent_level=0, + replace_with_html_entities=False): + return self.decode(pretty_print, indent_level, encoding, + replace_with_html_entities).encode(encoding) def decode(self, pretty_print=False, indent_level=0, - eventual_encoding=DEFAULT_OUTPUT_ENCODING): + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + replace_with_html_entities=False): """Returns a string or Unicode representation of this tag and its contents. To get Unicode, pass None for encoding.""" @@ -597,7 +600,8 @@ class Tag(PageElement, EntitySubstitution): space = (' ' * (indentTag-1)) indentContents = indentTag + 1 contents = self.decodeContents(pretty_print, indentContents, - eventual_encoding) + eventual_encoding, + replace_with_html_entities) if self.hidden: s = contents else: @@ -635,11 +639,15 @@ class Tag(PageElement, EntitySubstitution): return self.encode(encoding, True) def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING, - pretty_print=False, indent_level=0): - return self.decodeContents(pretty_print, indent_level).encode(encoding) + pretty_print=False, indent_level=0, + replace_With_html_entities=False): + return self.decodeContents( + pretty_print, indent_level, replace_with_html_entities).encode( + encoding) def decodeContents(self, pretty_print=False, indent_level=0, - eventual_encoding=DEFAULT_OUTPUT_ENCODING): + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + replace_with_html_entities=False): """Renders the contents of this tag as a string in the given encoding. 
If encoding is None, returns a Unicode string..""" s=[] @@ -648,10 +656,13 @@ class Tag(PageElement, EntitySubstitution): if isinstance(c, NavigableString): text = c.decodeGivenEventualEncoding(eventual_encoding) elif isinstance(c, Tag): - s.append(c.decode(pretty_print, indent_level, eventual_encoding)) + s.append(c.decode(pretty_print, indent_level, eventual_encoding, + replace_with_html_entities)) if text and pretty_print: text = text.strip() if text: + if replace_with_html_entities: + text = self.substitute_html(text) if pretty_print: s.append(" " * (indent_level-1)) s.append(text) diff --git a/tests/test_tree.py b/tests/test_tree.py index 0b3d72e..249e7ae 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -830,6 +830,19 @@ class TestPersistence(SoupTest): class TestSubstitutions(SoupTest): + def test_entity_substitution(self): + soup = self.soup( + u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>") + encoded = soup.encode("utf-8", replace_with_html_entities=True) + self.assertEquals(encoded, + self.document_for("<b>Sacr&eacute; bleu!</b>")) + + def test_entity_substitution_off_by_default(self): + markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>" + soup = self.soup(markup) + encoded = soup.b.encode("utf-8") + self.assertEquals(encoded, markup.encode('utf-8')) + def test_encoding_substitution(self): # Here's the <meta> tag saying that a document is # encoded in Shift-JIS. |