summary refs log tree commit diff
path: root/tests/test_html5lib.py
diff options
context:
space:
mode:
Diffstat (limited to 'tests/test_html5lib.py')
-rw-r--r-- tests/test_html5lib.py 57
1 files changed, 57 insertions, 0 deletions
diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py
new file mode 100644
index 0000000..417e87b
--- /dev/null
+++ b/tests/test_html5lib.py
@@ -0,0 +1,57 @@
+from beautifulsoup.builder.html5lib_builder import HTML5TreeBuilder
+from beautifulsoup.testing import (
+ BuilderInvalidMarkupSmokeTest,
+ BuilderSmokeTest,
+)
+
+
+class TestHTML5Builder(BuilderSmokeTest):
+    """Smoke tests for `HTML5TreeBuilder`; see `BuilderSmokeTest`."""
+
+    @property
+    def default_builder(self):
+        # A new builder instance is created on every access.
+        builder = HTML5TreeBuilder()
+        return builder
+
+    def test_bare_string(self):
+        # Parsing a bare string should yield markup that is still
+        # recognizable as that string.
+        #
+        # Here the expected markup handed to assertSoupEquals is the
+        # string itself, i.e. no wrapping tag such as <p> is expected.
+        bare = "A bare string"
+        self.assertSoupEquals(bare, bare)
+
+    def test_collapsed_whitespace(self):
+        """Whitespace is preserved even in tags that don't require it."""
+        # NOTE(review): with a single argument, assertSoupEquals presumably
+        # compares the parsed output against the input itself -- confirm
+        # in beautifulsoup.testing.
+        for markup in ("<p> </p>", "<b> </b>"):
+            self.assertSoupEquals(markup)
+
+
+class TestHTML5BuilderInvalidMarkup(BuilderInvalidMarkupSmokeTest):
+    """Invalid-markup smoke tests for `HTML5TreeBuilder`.
+
+    See `BuilderInvalidMarkupSmokeTest`.
+    """
+
+    @property
+    def default_builder(self):
+        # A new builder instance is created on every access.
+        return HTML5TreeBuilder()
+
+    def test_unclosed_block_level_elements(self):
+        # The unclosed <b> tag is closed so that the block-level tag
+        # can be closed, and another <b> tag is inserted after the
+        # next block-level tag begins.
+        self.assertSoupEquals(
+            '<blockquote><p><b>Foo</blockquote><p>Bar',
+            '<blockquote><p><b>Foo</b></p></blockquote><p><b>Bar</b></p>')
+
+    def test_incorrectly_nested_tables(self):
+        # The expected markup has the nested <table> emitted as a sibling
+        # of the outer one, with <tbody> and the missing end tags filled in.
+        self.assertSoupEquals(
+            '<table><tr><table><tr id="nested">',
+            ('<table><tbody><tr></tr></tbody></table>'
+             '<table><tbody><tr id="nested"></tr></tbody></table>'))
+
+    def test_foo(self):
+        # NOTE(review): this test makes no assertions. It parses a document
+        # that declares charset=ISO-Latin-1 and prints the resulting soup.
+        # TODO: give it a descriptive name and compare `soup` against an
+        # expected value once the intended output is settled.
+        isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
+        soup = self.soup(isolatin)
+
+        # The same document relabelled as utf-8, with the \xe9 byte swapped
+        # for the two-byte sequence \xc3\xa9. Presumably the start of an
+        # expected value; it is not compared against anything yet.
+        # Plain string literals are used as the replace() arguments:
+        # calling .encode() on them is a no-op on Python 2 and would make
+        # replace() raise TypeError on Python 3.
+        utf8 = isolatin.replace("ISO-Latin-1", "utf-8")
+        utf8 = utf8.replace("\xe9", "\xc3\xa9")
+
+        # Parenthesized form prints the same thing on Python 2 and, unlike
+        # the print statement, does not break parsing on Python 3.
+        print(soup)