summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2011-02-10 11:52:30 -0500
committer Leonard Richardson <leonard.richardson@canonical.com> 2011-02-10 11:52:30 -0500
commit bb9d9c5dc0af0deefc1a77542c007b7040aa55bb (patch)
tree 1873ec97e3684c4676d1c62177b60e42aa4f1f2b
parent 749f01e2b664dcbf4f58dfbdcaa4d314f6e3b9ef (diff)
Ported some more tests demonstrating that entities are converted to Unicode characters on the way in.
-rw-r--r-- TODO 5
-rw-r--r-- tests/test_lxml.py 47
-rw-r--r-- tests/test_tree.py 6
3 files changed, 54 insertions, 4 deletions
diff --git a/TODO b/TODO
index 71ff3fe..9792743 100644
--- a/TODO
+++ b/TODO
@@ -2,6 +2,11 @@ html5lib has its own Unicode, Dammit-like system. Converting the input
to Unicode should be up to the builder. The lxml builder would use
Unicode, Dammit, and the html5lib builder would be a no-op.
+Bare ampersands should be converted to HTML entities upon output.
+
+It should also be possible to convert certain Unicode characters to
+HTML entities upon output.
+
---
Here are some unit tests that fail with HTMLParser.
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index e6e015b..455c953 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -114,13 +114,58 @@ class TestLXMLBuilder(SoupTest):
soup = BeautifulSoup('<script>%s</script>' % javascript)
self.assertEquals(soup.script.string, javascript)
- def test_entities_converted_on_the_way_in(self):
+ def test_naked_ampersands(self):
+ # Ampersands are left alone.
+ text = "<p>AT&T</p>"
+ soup = self.soup(text)
+ self.assertEquals(soup.p.string, "AT&T")
+
+ # Even if they're in attribute values.
+ invalid_url = '<a href="http://example.org?a=1&b=2;3">foo</a>'
+ soup = self.soup(invalid_url)
+ self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3")
+
+ def test_entities_in_strings_converted_during_parsing(self):
# Both XML and HTML entities are converted to Unicode characters
# during parsing.
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
self.assertSoupEquals(text, expected)
+ def test_entities_in_attribute_values_converted_during_parsing(self):
+ text = '<x t="pi&#241ata">'
+ expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
+ soup = self.soup(text)
+ self.assertEquals(soup.x['t'], expected)
+
+ text = '<x t="pi&#xf1;ata">'
+ soup = self.soup(text)
+ self.assertEquals(soup.x['t'], expected)
+
+ text = '<x t="sacr&eacute; bleu">'
+ soup = self.soup(text)
+ self.assertEquals(
+ soup.x['t'],
+ u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu")
+
+ # This can cause valid HTML to become invalid.
+ valid_url = '<a href="http://example.org?a=1&amp;b=2;3">foo</a>'
+ soup = self.soup(valid_url)
+ self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3")
+
+ def test_smart_quotes_converted_on_the_way_in(self):
+ # Microsoft smart quotes are converted to Unicode characters during
+ # parsing.
+ quote = "<p>\x91Foo\x92</p>"
+ soup = self.soup(quote)
+ self.assertEquals(
+ soup.p.string,
+ u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
+
+ def test_non_breaking_spaces_converted_on_the_way_in(self):
+ soup = self.soup("<a>&nbsp;&nbsp;</a>")
+ self.assertEquals(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
+
# Tests below this line need work.
def test_entities_converted_on_the_way_out(self):
diff --git a/tests/test_tree.py b/tests/test_tree.py
index ed29d76..367489e 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -818,7 +818,7 @@ class TestPersistence(SoupTest):
class TestEncoding(SoupTest):
- """Test the ability to encode strings."""
+ """Test the ability to encode objects into strings."""
def test_unicode_string_can_be_encoded(self):
html = u"<b>\N{SNOWMAN}</b>"
@@ -829,5 +829,5 @@ class TestEncoding(SoupTest):
def test_tag_containing_unicode_string_can_be_encoded(self):
html = u"<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
- self.assertEquals(soup.b.encode("utf-8"),
- html.encode("utf-8"))
+ self.assertEquals(
+ soup.b.encode("utf-8"), html.encode("utf-8"))