summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2011-02-11 09:10:56 -0500
committer Leonard Richardson <leonard.richardson@canonical.com> 2011-02-11 09:10:56 -0500
commit d0531c4204a67a4289025bf7108a922f680fa057 (patch)
tree cdad3f97812e658d84a611b6017b7198fd97d818
parent 3366ad67dc2dfdd508267efc87dfc851b612fb0d (diff)
parent d89c8878ea86a2575c87e9fad8081cfcd81e0bcd (diff)
Ported some more tests, fixed an encoding problem, and added rudimentary doctype handling.
-rw-r--r-- TODO 5
-rw-r--r-- beautifulsoup/builder/lxml_builder.py 41
-rw-r--r-- beautifulsoup/element.py 8
-rw-r--r-- tests/test_html5lib.py 32
-rw-r--r-- tests/test_lxml.py 119
-rw-r--r-- tests/test_tree.py 16
6 files changed, 215 insertions, 6 deletions
diff --git a/TODO b/TODO
index 71ff3fe..9792743 100644
--- a/TODO
+++ b/TODO
@@ -2,6 +2,11 @@ html5lib has its own Unicode, Dammit-like system. Converting the input
to Unicode should be up to the builder. The lxml builder would use
Unicode, Dammit, and the html5lib builder would be a no-op.
+Bare ampersands should be converted to HTML entities upon output.
+
+It should also be possible to convert certain Unicode characters to
+HTML entities upon output.
+
---
Here are some unit tests that fail with HTMLParser.
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py
new file mode 100644
index 0000000..8336ab4
--- /dev/null
+++ b/beautifulsoup/builder/lxml_builder.py
@@ -0,0 +1,41 @@
+from lxml import etree
+from beautifulsoup.element import Comment, Doctype
+from beautifulsoup.builder import HTMLTreeBuilder
+
+class LXMLTreeBuilder(HTMLTreeBuilder):
+
+ def __init__(self, parser_class=etree.HTMLParser):
+ self.parser = parser_class(target=self)
+ self.soup = None
+
+ def feed(self, markup):
+ self.parser.feed(markup)
+ self.parser.close()
+
+ def close(self):
+ pass
+
+ def start(self, name, attrs):
+ self.soup.handle_starttag(name, attrs)
+
+ def end(self, name):
+ self.soup.handle_endtag(name)
+
+ def data(self, content):
+ self.soup.handle_data(content)
+
+ def doctype(self, name, pubid, system):
+ self.soup.endData()
+ self.soup.handle_data(name)
+ self.soup.endData(Doctype)
+
+ def comment(self, content):
+ "Handle comments as Comment objects."
+ self.soup.endData()
+ self.soup.handle_data(content)
+ self.soup.endData(Comment)
+
+ def test_fragment_to_document(self, fragment):
+ """See `TreeBuilder`."""
+ return u'<html><body>%s</body></html>' % fragment
+
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index bd9bcbf..b2e0e12 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -346,9 +346,6 @@ class NavigableString(unicode, PageElement):
else:
raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
- def encode(self, encoding=DEFAULT_OUTPUT_ENCODING):
- return self.decode().encode(encoding)
-
def decodeGivenEventualEncoding(self, eventualEncoding):
return self
@@ -373,6 +370,11 @@ class Declaration(NavigableString):
def decodeGivenEventualEncoding(self, eventualEncoding):
return u'<!' + self + u'>'
+class Doctype(NavigableString):
+
+ def decodeGivenEventualEncoding(self, eventualEncoding):
+ return u'<!DOCTYPE ' + self + u'>'
+
class Tag(PageElement, Entities):
"""Represents a found HTML tag with its attributes and contents."""
diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py
index 3a4ee27..dada900 100644
--- a/tests/test_html5lib.py
+++ b/tests/test_html5lib.py
@@ -19,11 +19,32 @@ class TestHTML5Builder(TestLXMLBuilder):
self.assertSoupEquals(
"A bare string", "A bare string")
+ def test_correctly_nested_tables(self):
+ markup = ('<table id="1">'
+ '<tr>'
+ "<td>Here's another table:"
+ '<table id="2">'
+ '<tr><td>foo</td></tr>'
+ '</table></td>')
+
+ self.assertSoupEquals(
+ markup,
+ '<table id="1"><tbody><tr><td>Here\'s another table:'
+ '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
+ '</td></tr></tbody></table>')
+
+ self.assertSoupEquals(
+ "<table><thead><tr><td>Foo</td></tr></thead>"
+ "<tbody><tr><td>Bar</td></tr></tbody>"
+ "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
+
def test_collapsed_whitespace(self):
"""Whitespace is preserved even in tags that don't require it."""
self.assertSoupEquals("<p> </p>")
self.assertSoupEquals("<b> </b>")
+ def test_cdata(self):
+ print self.soup("<div><![CDATA[foo]]></div>")
class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
"""See `BuilderInvalidMarkupSmokeTest`."""
@@ -40,12 +61,21 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
'<blockquote><p><b>Foo</blockquote><p>Bar',
'<blockquote><p><b>Foo</b></p></blockquote><p><b>Bar</b></p>')
+ def test_table_containing_bare_markup(self):
+ # Markup should be in table cells, not directly in the table.
+ self.assertSoupEquals("<table><div>Foo</div></table>",
+ "<div>Foo</div><table></table>")
+
def test_incorrectly_nested_tables(self):
self.assertSoupEquals(
'<table><tr><table><tr id="nested">',
('<table><tbody><tr></tr></tbody></table>'
'<table><tbody><tr id="nested"></tr></tbody></table>'))
+ def test_doctype_in_body(self):
+ markup = "<p>one<!DOCTYPE foobar>two</p>"
+ self.assertSoupEquals(markup, "<p>onetwo</p>")
+
def test_foo(self):
isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
soup = self.soup(isolatin)
@@ -53,4 +83,4 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode())
utf8 = utf8.replace("\xe9", "\xc3\xa9")
- print soup
+ #print soup
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index d16e8d9..9a65f6a 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -1,5 +1,7 @@
"""Tests to ensure that the lxml tree builder generates good trees."""
+import re
+
from beautifulsoup import BeautifulSoup
from beautifulsoup.builder.lxml_builder import LXMLTreeBuilder
from beautifulsoup.element import Comment
@@ -65,6 +67,34 @@ class TestLXMLBuilder(SoupTest):
self.assertEqual(blockquote.p.b.string, 'Foo')
self.assertEqual(blockquote.b.string, 'Foo')
+ # This is a <table> tag containing another <table> tag in one of its
+ # cells.
+ TABLE_MARKUP_1 = ('<table id="1">'
+ '<tr>'
+ "<td>Here's another table:"
+ '<table id="2">'
+ '<tr><td>foo</td></tr>'
+ '</table></td>')
+
+ def test_correctly_nested_tables(self):
+ markup = ('<table id="1">'
+ '<tr>'
+ "<td>Here's another table:"
+ '<table id="2">'
+ '<tr><td>foo</td></tr>'
+ '</table></td>')
+
+ self.assertSoupEquals(
+ markup,
+ '<table id="1"><tr><td>Here\'s another table:'
+ '<table id="2"><tr><td>foo</td></tr></table>'
+ '</td></tr></table>')
+
+ self.assertSoupEquals(
+ "<table><thead><tr><td>Foo</td></tr></thead>"
+ "<tbody><tr><td>Bar</td></tr></tbody>"
+ "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
+
def test_collapsed_whitespace(self):
"""In most tags, whitespace is collapsed."""
self.assertSoupEquals("<p> </p>", "<p> </p>")
@@ -114,14 +144,82 @@ class TestLXMLBuilder(SoupTest):
soup = BeautifulSoup('<script>%s</script>' % javascript)
self.assertEquals(soup.script.string, javascript)
+ def test_naked_ampersands(self):
+ # Ampersands are left alone.
+ text = "<p>AT&T</p>"
+ soup = self.soup(text)
+ self.assertEquals(soup.p.string, "AT&T")
+
+ # Even if they're in attribute values.
+ invalid_url = '<a href="http://example.org?a=1&b=2;3">foo</a>'
+ soup = self.soup(invalid_url)
+ self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3")
+
+ def test_entities_in_strings_converted_during_parsing(self):
+ # Both XML and HTML entities are converted to Unicode characters
+ # during parsing.
+ text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
+ expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
+ self.assertSoupEquals(text, expected)
+
+ def test_entities_in_attribute_values_converted_during_parsing(self):
+ text = '<x t="pi&#241ata">'
+ expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
+ soup = self.soup(text)
+ self.assertEquals(soup.x['t'], expected)
+
+ text = '<x t="pi&#xf1;ata">'
+ soup = self.soup(text)
+ self.assertEquals(soup.x['t'], expected)
+
+ text = '<x t="sacr&eacute; bleu">'
+ soup = self.soup(text)
+ self.assertEquals(
+ soup.x['t'],
+ u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu")
+
+ # This can cause valid HTML to become invalid.
+ valid_url = '<a href="http://example.org?a=1&amp;b=2;3">foo</a>'
+ soup = self.soup(valid_url)
+ self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3")
+
+ def test_smart_quotes_converted_on_the_way_in(self):
+ # Microsoft smart quotes are converted to Unicode characters during
+ # parsing.
+ quote = "<p>\x91Foo\x92</p>"
+ soup = self.soup(quote)
+ self.assertEquals(
+ soup.p.string,
+ u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
+
+ def test_non_breaking_spaces_converted_on_the_way_in(self):
+ soup = self.soup("<a>&nbsp;&nbsp;</a>")
+ self.assertEquals(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
+
+ # Tests below this line need work.
+
+ #def test_doctype(self):
+ # xml = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"><html>foo</html></p>'
+ # self.assertSoupEquals(xml)
+
+
+ #def test_cdata(self):
+ # print self.soup("<div><![CDATA[foo]]></div>")
+
+ def test_entities_converted_on_the_way_out(self):
+ text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
+ expected = u"&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;".encode("utf-8")
+ soup = BeautifulSoup(text)
+ str = soup.p.string
+ #self.assertEquals(str.encode("utf-8"), expected)
+
def test_foo(self):
isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
soup = self.soup(isolatin)
utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode())
utf8 = utf8.replace("\xe9", "\xc3\xa9")
-
- print soup
+ #print soup
class TestLXMLBuilderInvalidMarkup(SoupTest):
@@ -134,6 +232,20 @@ class TestLXMLBuilderInvalidMarkup(SoupTest):
markup at all.
"""
+ def test_table_containing_bare_markup(self):
+ # Markup should be in table cells, not directly in the table.
+ self.assertSoupEquals("<table><div>Foo</div></table>")
+
+ def test_incorrectly_nested_table(self):
+ # The second <table> tag is floating in the <tr> tag
+ # rather than being inside a <td>.
+ bad_markup = ('<table id="1">'
+ '<tr>'
+ "<td>Here's another table:</td>"
+ '<table id="2">'
+ '<tr><td>foo</td></tr>'
+ '</table></td>')
+
def test_unclosed_block_level_elements(self):
# Unclosed block-level elements should be closed.
self.assertSoupEquals(
@@ -157,4 +269,7 @@ class TestLXMLBuilderInvalidMarkup(SoupTest):
'<table><tr><table><tr id="nested">',
'<table><tr><table><tr id="nested"></tr></table></tr></table>')
+ def test_doctype_in_body(self):
+ markup = "<p>one<!DOCTYPE foobar>two</p>"
+ self.assertSoupEquals(markup)
diff --git a/tests/test_tree.py b/tests/test_tree.py
index eac4e72..367489e 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -815,3 +815,19 @@ class TestPersistence(SoupTest):
dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
loaded = pickle.loads(dumped)
self.assertEqual(loaded.decode(), soup.decode())
+
+
+class TestEncoding(SoupTest):
+ """Test the ability to encode objects into strings."""
+
+ def test_unicode_string_can_be_encoded(self):
+ html = u"<b>\N{SNOWMAN}</b>"
+ soup = self.soup(html)
+ self.assertEquals(soup.b.string.encode("utf-8"),
+ u"\N{SNOWMAN}".encode("utf-8"))
+
+ def test_tag_containing_unicode_string_can_be_encoded(self):
+ html = u"<b>\N{SNOWMAN}</b>"
+ soup = self.soup(html)
+ self.assertEquals(
+ soup.b.encode("utf-8"), html.encode("utf-8"))