First stab at HTML entity replacement.

author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-26 21:26:15 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-26 21:26:15 -0500
commit: d7056f49c8bb3a448cec2f1a6f2de55e93c8e8d6 (patch)
tree: cc75bdadec0060288b1b76213c3ac2de34d9ab68
parent: ef5770589595e80cbd9690b64504a2166b3558fd (diff)
4 files changed, 38 insertions, 12 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index cee55e7..f4c2a95 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -263,7 +263,8 @@ class BeautifulSoup(Tag):
         self.currentData.append(data)
 
     def decode(self, pretty_print=False, indent_level=0,
-               eventual_encoding=DEFAULT_OUTPUT_ENCODING):
+               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
+               replace_with_html_entities=False):
         """Returns a string or Unicode representation of this document.
         To get Unicode, pass None for encoding."""
         if self.is_xml:
@@ -275,7 +276,8 @@ class BeautifulSoup(Tag):
         else:
             prefix = u''
         return prefix + super(BeautifulSoup, self).decode(
-            pretty_print, indent_level, eventual_encoding)
+            pretty_print, indent_level, eventual_encoding,
+            replace_with_html_entities)
 
 
 class StopParsing(Exception):
diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py
index 9833bd4..31dfa95 100644
--- a/beautifulsoup/dammit.py
+++ b/beautifulsoup/dammit.py
@@ -37,8 +37,8 @@ class EntitySubstitution(object):
         for codepoint, name in codepoint2name.items():
             if codepoint == 34:
                 # There's no point in turning the quotation mark into
-                # &quot;, unless it happens in an attribute value, which
-                # is done elsewhere.
+                # &quot;, unless it happens within an attribute value, which
+                # is handled elsewhere.
                 continue;
             character = unichr(codepoint)
             characters.append(character)
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index 23f8c33..f3a59d4 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -561,11 +561,14 @@ class Tag(PageElement, EntitySubstitution):
         return self.encode()
 
     def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
-               pretty_print=False, indent_level=0):
-        return self.decode(pretty_print, indent_level, encoding).encode(encoding)
+               pretty_print=False, indent_level=0,
+               replace_with_html_entities=False):
+        return self.decode(pretty_print, indent_level, encoding,
+                           replace_with_html_entities).encode(encoding)
 
     def decode(self, pretty_print=False, indent_level=0,
-               eventual_encoding=DEFAULT_OUTPUT_ENCODING):
+               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
+               replace_with_html_entities=False):
         """Returns a string or Unicode representation of this tag and
         its contents. To get Unicode, pass None for encoding."""
 
@@ -597,7 +600,8 @@ class Tag(PageElement, EntitySubstitution):
             space = (' ' * (indentTag-1))
             indentContents = indentTag + 1
         contents = self.decodeContents(pretty_print, indentContents,
-                                       eventual_encoding)
+                                       eventual_encoding,
+                                       replace_with_html_entities)
         if self.hidden:
             s = contents
         else:
@@ -635,11 +639,15 @@ class Tag(PageElement, EntitySubstitution):
         return self.encode(encoding, True)
 
     def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
-                       pretty_print=False, indent_level=0):
-        return self.decodeContents(pretty_print, indent_level).encode(encoding)
+                       pretty_print=False, indent_level=0,
+                       replace_with_html_entities=False):
+        return self.decodeContents(
+            pretty_print, indent_level, encoding,
+            replace_with_html_entities).encode(encoding)
 
     def decodeContents(self, pretty_print=False, indent_level=0,
-                       eventual_encoding=DEFAULT_OUTPUT_ENCODING):
+                       eventual_encoding=DEFAULT_OUTPUT_ENCODING,
+                       replace_with_html_entities=False):
         """Renders the contents of this tag as a string in the given
         encoding. If encoding is None, returns a Unicode string.."""
         s=[]
@@ -648,10 +656,13 @@ class Tag(PageElement, EntitySubstitution):
             if isinstance(c, NavigableString):
                 text = c.decodeGivenEventualEncoding(eventual_encoding)
             elif isinstance(c, Tag):
-                s.append(c.decode(pretty_print, indent_level, eventual_encoding))
+                s.append(c.decode(pretty_print, indent_level, eventual_encoding,
+                                  replace_with_html_entities))
             if text and pretty_print:
                 text = text.strip()
             if text:
+                if replace_with_html_entities:
+                    text = self.substitute_html(text)
                 if pretty_print:
                     s.append(" " * (indent_level-1))
                 s.append(text)
diff --git a/tests/test_tree.py b/tests/test_tree.py
index 0b3d72e..249e7ae 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -830,6 +830,19 @@ class TestPersistence(SoupTest):
 
 class TestSubstitutions(SoupTest):
 
+    def test_entity_substitution(self):
+        soup = self.soup(
+            u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>")
+        encoded = soup.encode("utf-8", replace_with_html_entities=True)
+        self.assertEquals(encoded,
+                          self.document_for("<b>Sacr&eacute; bleu!</b>"))
+
+    def test_entity_substitution_off_by_default(self):
+        markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
+        soup = self.soup(markup)
+        encoded = soup.b.encode("utf-8")
+        self.assertEquals(encoded, markup.encode('utf-8'))
+
     def test_encoding_substitution(self):
         # Here's the <meta> tag saying that a document is
         # encoded in Shift-JIS.
author	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-26 21:26:15 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-26 21:26:15 -0500
commit	d7056f49c8bb3a448cec2f1a6f2de55e93c8e8d6 (patch)
tree	cc75bdadec0060288b1b76213c3ac2de34d9ab68
parent	ef5770589595e80cbd9690b64504a2166b3558fd (diff)