summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2011-02-26 21:26:15 -0500
committer Leonard Richardson <leonard.richardson@canonical.com> 2011-02-26 21:26:15 -0500
commit d7056f49c8bb3a448cec2f1a6f2de55e93c8e8d6 (patch)
tree cc75bdadec0060288b1b76213c3ac2de34d9ab68
parent ef5770589595e80cbd9690b64504a2166b3558fd (diff)
First stab at HTML entity replacement.
-rw-r--r-- beautifulsoup/__init__.py 6
-rw-r--r-- beautifulsoup/dammit.py 4
-rw-r--r-- beautifulsoup/element.py 27
-rw-r--r-- tests/test_tree.py 13
4 files changed, 38 insertions, 12 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index cee55e7..f4c2a95 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -263,7 +263,8 @@ class BeautifulSoup(Tag):
self.currentData.append(data)
def decode(self, pretty_print=False, indent_level=0,
- eventual_encoding=DEFAULT_OUTPUT_ENCODING):
+ eventual_encoding=DEFAULT_OUTPUT_ENCODING,
+ replace_with_html_entities=False):
"""Returns a string or Unicode representation of this document.
To get Unicode, pass None for encoding."""
if self.is_xml:
@@ -275,7 +276,8 @@ class BeautifulSoup(Tag):
else:
prefix = u''
return prefix + super(BeautifulSoup, self).decode(
- pretty_print, indent_level, eventual_encoding)
+ pretty_print, indent_level, eventual_encoding,
+ replace_with_html_entities)
class StopParsing(Exception):
diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py
index 9833bd4..31dfa95 100644
--- a/beautifulsoup/dammit.py
+++ b/beautifulsoup/dammit.py
@@ -37,8 +37,8 @@ class EntitySubstitution(object):
for codepoint, name in codepoint2name.items():
if codepoint == 34:
# There's no point in turning the quotation mark into
- # &quot;, unless it happens in an attribute value, which
- # is done elsewhere.
+ # &quot;, unless it happens within an attribute value, which
+ # is handled elsewhere.
continue;
character = unichr(codepoint)
characters.append(character)
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index 23f8c33..f3a59d4 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -561,11 +561,14 @@ class Tag(PageElement, EntitySubstitution):
return self.encode()
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
- pretty_print=False, indent_level=0):
- return self.decode(pretty_print, indent_level, encoding).encode(encoding)
+ pretty_print=False, indent_level=0,
+ replace_with_html_entities=False):
+ return self.decode(pretty_print, indent_level, encoding,
+ replace_with_html_entities).encode(encoding)
def decode(self, pretty_print=False, indent_level=0,
- eventual_encoding=DEFAULT_OUTPUT_ENCODING):
+ eventual_encoding=DEFAULT_OUTPUT_ENCODING,
+ replace_with_html_entities=False):
"""Returns a string or Unicode representation of this tag and
its contents. To get Unicode, pass None for encoding."""
@@ -597,7 +600,8 @@ class Tag(PageElement, EntitySubstitution):
space = (' ' * (indentTag-1))
indentContents = indentTag + 1
contents = self.decodeContents(pretty_print, indentContents,
- eventual_encoding)
+ eventual_encoding,
+ replace_with_html_entities)
if self.hidden:
s = contents
else:
@@ -635,11 +639,15 @@ class Tag(PageElement, EntitySubstitution):
return self.encode(encoding, True)
def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
- pretty_print=False, indent_level=0):
- return self.decodeContents(pretty_print, indent_level).encode(encoding)
+ pretty_print=False, indent_level=0,
+ replace_with_html_entities=False):
+ return self.decodeContents(
+ pretty_print, indent_level, encoding,
+ replace_with_html_entities).encode(encoding)
def decodeContents(self, pretty_print=False, indent_level=0,
- eventual_encoding=DEFAULT_OUTPUT_ENCODING):
+ eventual_encoding=DEFAULT_OUTPUT_ENCODING,
+ replace_with_html_entities=False):
"""Renders the contents of this tag as a string in the given
encoding. If encoding is None, returns a Unicode string.."""
s=[]
@@ -648,10 +656,13 @@ class Tag(PageElement, EntitySubstitution):
if isinstance(c, NavigableString):
text = c.decodeGivenEventualEncoding(eventual_encoding)
elif isinstance(c, Tag):
- s.append(c.decode(pretty_print, indent_level, eventual_encoding))
+ s.append(c.decode(pretty_print, indent_level, eventual_encoding,
+ replace_with_html_entities))
if text and pretty_print:
text = text.strip()
if text:
+ if replace_with_html_entities:
+ text = self.substitute_html(text)
if pretty_print:
s.append(" " * (indent_level-1))
s.append(text)
diff --git a/tests/test_tree.py b/tests/test_tree.py
index 0b3d72e..249e7ae 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -830,6 +830,19 @@ class TestPersistence(SoupTest):
class TestSubstitutions(SoupTest):
+ def test_entity_substitution(self):
+ soup = self.soup(
+ u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>")
+ encoded = soup.encode("utf-8", replace_with_html_entities=True)
+ self.assertEquals(encoded,
+ self.document_for("<b>Sacr&eacute; bleu!</b>"))
+
+ def test_entity_substitution_off_by_default(self):
+ markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
+ soup = self.soup(markup)
+ encoded = soup.b.encode("utf-8")
+ self.assertEquals(encoded, markup.encode('utf-8'))
+
def test_encoding_substitution(self):
# Here's the <meta> tag saying that a document is
# encoded in Shift-JIS.