diff options
Diffstat (limited to 'src/beautifulsoup/tests')
-rw-r--r-- | src/beautifulsoup/tests/helpers.py | 154 | ||||
-rw-r--r-- | src/beautifulsoup/tests/test_html5lib.py | 12 | ||||
-rw-r--r-- | src/beautifulsoup/tests/test_lxml.py | 14 | ||||
-rw-r--r-- | src/beautifulsoup/tests/test_soup.py | 27 | ||||
-rw-r--r-- | src/beautifulsoup/tests/test_strainer.py | 2 | ||||
-rw-r--r-- | src/beautifulsoup/tests/test_tree.py | 2 |
6 files changed, 47 insertions, 164 deletions
diff --git a/src/beautifulsoup/tests/helpers.py b/src/beautifulsoup/tests/helpers.py deleted file mode 100644 index 20d087e..0000000 --- a/src/beautifulsoup/tests/helpers.py +++ /dev/null @@ -1,154 +0,0 @@ -"""Helper classes for tests.""" - -import unittest -from beautifulsoup import BeautifulSoup -from beautifulsoup.element import Comment, SoupStrainer -from beautifulsoup.builder.lxml_builder import LXMLTreeBuilder - -class SoupTest(unittest.TestCase): - - def setUp(self): - # LXMLTreeBuilder won't handle bad markup, but that's fine, - # since all the parsing tests take place in parser-specific - # test suites that override default_builder. - self.default_builder = LXMLTreeBuilder() - - def soup(self, markup, **kwargs): - """Build a Beautiful Soup object from markup.""" - return BeautifulSoup(markup, builder=self.default_builder, **kwargs) - - def document_for(self, markup): - """Turn an HTML fragment into a document. - - The details depend on the builder. - """ - return self.default_builder.test_fragment_to_document(markup) - - def assertSoupEquals(self, to_parse, compare_parsed_to=None): - builder = self.default_builder - obj = BeautifulSoup(to_parse, builder=builder) - if compare_parsed_to is None: - compare_parsed_to = to_parse - - self.assertEquals(obj.decode(), self.document_for(compare_parsed_to)) - - - -class BuilderSmokeTest(SoupTest): - """A generic smoke test for tree builders. - - Subclasses of this test ensure that all of Beautiful Soup's tree - builders generate more or less the same trees. It's okay for trees - to differ, especially when given invalid markup--just override the - appropriate test method to demonstrate how one tree builder - differs from others. - """ - - def test_bare_string(self): - # A bare string is turned into some kind of HTML document or - # fragment recognizable as the original string. - self.assertSoupEquals("A bare string") - - def test_mixed_case_tags(self): - # Mixed-case tags are folded to lowercase. - self.assertSoupEquals( - "<a><B><Cd><EFG></efg></CD></b></A>", - "<a><b><cd><efg></efg></cd></b></a>") - - def test_self_closing(self): - # HTML's self-closing tags are recognized as such. - self.assertSoupEquals( - "<p>A <meta> tag</p>", "<p>A <meta /> tag</p>") - - self.assertSoupEquals( - "<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>") - - def test_comment(self): - # Comments are represented as Comment objects. - markup = "<p>foo<!--foobar-->baz</p>" - self.assertSoupEquals(markup) - - soup = self.soup(markup) - comment = soup.find(text="foobar") - self.assertEquals(comment.__class__, Comment) - - def test_nested_inline_elements(self): - # Inline tags can be nested indefinitely. - b_tag = "<b>Inside a B tag</b>" - self.assertSoupEquals(b_tag) - - nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>" - self.assertSoupEquals(nested_b_tag) - - double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>" - self.assertSoupEquals(nested_b_tag) - - def test_nested_block_level_elements(self): - soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>') - blockquote = soup.blockquote - self.assertEqual(blockquote.p.b.string, 'Foo') - self.assertEqual(blockquote.b.string, 'Foo') - - def test_collapsed_whitespace(self): - """In most tags, whitespace is collapsed.""" - self.assertSoupEquals("<p> </p>", "<p> </p>") - - def test_preserved_whitespace_in_pre_and_textarea(self): - """In <pre> and <textarea> tags, whitespace is preserved.""" - self.assertSoupEquals("<pre> </pre>") - self.assertSoupEquals("<textarea> woo </textarea>") - - def test_single_quote_attribute_values_become_double_quotes(self): - self.assertSoupEquals("<foo attr='bar'></foo>", - '<foo attr="bar"></foo>') - - def test_attribute_values_with_nested_quotes_are_left_alone(self): - text = """<foo attr='bar "brawls" happen'>a</foo>""" - self.assertSoupEquals(text) - - def test_attribute_values_with_double_nested_quotes_get_quoted(self): - text = """<foo attr='bar "brawls" happen'>a</foo>""" - soup = self.soup(text) - soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' - self.assertSoupEquals( - soup.foo.decode(), - """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>""") - - def test_ampersand_in_attribute_value_gets_quoted(self): - self.assertSoupEquals('<this is="really messed up & stuff"></this>', - '<this is="really messed up & stuff"></this>') - - -class BuilderInvalidMarkupSmokeTest(SoupTest): - """Tests of invalid markup. - - These are very likely to give different results for different tree - builders. It's not required that a tree builder handle invalid - markup at all. - """ - - def test_unclosed_block_level_elements(self): - # Unclosed block-level elements should be closed. - self.assertSoupEquals( - '<blockquote><p><b>Foo</blockquote><p>Bar', - '<blockquote><p><b>Foo</b></p></blockquote><p>Bar</p>') - - def test_fake_self_closing_tag(self): - # If a self-closing tag presents as a normal tag, the 'open' - # tag is treated as an instance of the self-closing tag and - # the 'close' tag is ignored. - self.assertSoupEquals( - "<item><link>http://foo.com/</link></item>", - "<item><link />http://foo.com/</item>") - - def test_boolean_attribute_with_no_value_gets_empty_value(self): - soup = self.soup("<table><td nowrap>foo</td></table>") - self.assertEquals(soup.table.td['nowrap'], '') - - def test_incorrectly_nested_tables(self): - self.assertSoupEquals( - '<table><tr><table><tr id="nested">', - '<table><tr><table><tr id="nested"></tr></table></tr></table>') - - - diff --git a/src/beautifulsoup/tests/test_html5lib.py b/src/beautifulsoup/tests/test_html5lib.py index 4ffd968..7164dac 100644 --- a/src/beautifulsoup/tests/test_html5lib.py +++ b/src/beautifulsoup/tests/test_html5lib.py @@ -1,5 +1,8 @@ -from helpers import BuilderInvalidMarkupSmokeTest, BuilderSmokeTest from beautifulsoup.builder.html5lib_builder import HTML5TreeBuilder +from beautifulsoup.testing import ( + BuilderInvalidMarkupSmokeTest, + BuilderSmokeTest, +) class TestHTML5Builder(BuilderSmokeTest): @@ -30,4 +33,11 @@ class TestHTML5BuilderInvalidMarkup(BuilderInvalidMarkupSmokeTest): '<table><tbody><tr id="nested"></tr></tbody></table>')) + def test_foo(self): + isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" + soup = self.soup(isolatin) + utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode()) + utf8 = utf8.replace("\xe9", "\xc3\xa9") + + print soup diff --git a/src/beautifulsoup/tests/test_lxml.py b/src/beautifulsoup/tests/test_lxml.py index cd22b6f..b53ee42 100644 --- a/src/beautifulsoup/tests/test_lxml.py +++ b/src/beautifulsoup/tests/test_lxml.py @@ -1,6 +1,9 @@ """Tests to ensure that the lxml tree builder generates good trees.""" -from helpers import BuilderInvalidMarkupSmokeTest, BuilderSmokeTest +from beautifulsoup.testing import ( + BuilderInvalidMarkupSmokeTest, + BuilderSmokeTest, +) class TestLXMLBuilder(BuilderSmokeTest): """See `BuilderSmokeTest`.""" @@ -10,6 +13,15 @@ class TestLXMLBuilder(BuilderSmokeTest): self.assertSoupEquals( "A bare string", "<p>A bare string</p>") + def test_foo(self): + isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" + soup = self.soup(isolatin) + + utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode()) + utf8 = utf8.replace("\xe9", "\xc3\xa9") + + print soup + class TestLXMLBuilderInvalidMarkup(BuilderInvalidMarkupSmokeTest): """See `BuilderInvalidMarkupSmokeTest`.""" diff --git a/src/beautifulsoup/tests/test_soup.py b/src/beautifulsoup/tests/test_soup.py index d95cba6..ec0394d 100644 --- a/src/beautifulsoup/tests/test_soup.py +++ b/src/beautifulsoup/tests/test_soup.py @@ -2,9 +2,9 @@ """Tests of Beautiful Soup as a whole.""" import unittest -from helpers import SoupTest from beautifulsoup.element import SoupStrainer from beautifulsoup.dammit import UnicodeDammit +from beautifulsoup.testing import SoupTest class TestEncodingConversion(SoupTest): @@ -48,6 +48,15 @@ class TestEncodingConversion(SoupTest): soup_from_unicode = self.soup(self.unicode_data) self.assertEquals(soup_from_unicode.encode('utf-8'), self.utf8_data) + def test_hebrew(self): + # A real-world test to make sure we can convert ISO-8859-9 (a + # Hebrew encoding) to UTF-8. + iso_8859_8= '<HTML><HEAD><TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE></HEAD><BODY><H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\xed\xe5\xec\xf9</BODY></HTML>' + utf8 = '<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9</body></html>' + soup = self.soup(iso_8859_8, fromEncoding="iso-8859-8") + self.assertEquals(soup.originalEncoding, 'iso-8859-8') + self.assertEquals(soup.encode('utf-8'), utf8) + class TestSelectiveParsing(SoupTest): @@ -58,14 +67,20 @@ class TestSelectiveParsing(SoupTest): self.assertEquals(soup.encode(), "<b>Yes</b><b>Yes <c>Yes</c></b>") - class TestUnicodeDammit(unittest.TestCase): """Standalone tests of Unicode, Dammit.""" - def test_smart_quote_replacement(self): - markup = "<foo>\x92</foo>" + def test_smart_quotes_to_xml_entities(self): + markup = "<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup) - self.assertEquals(dammit.unicode, "<foo>’</foo>") + self.assertEquals( + dammit.unicode, "<foo>‘’“”</foo>") + + def test_smart_quotes_to_html_entities(self): + markup = "<foo>\x91\x92\x93\x94</foo>" + dammit = UnicodeDammit(markup, smartQuotesTo="html") + self.assertEquals( + dammit.unicode, "<foo>‘’“”</foo>") def test_detect_utf8(self): utf8 = "\xc3\xa9" @@ -87,7 +102,7 @@ class TestUnicodeDammit(unittest.TestCase): def test_ignore_inappropriate_codecs(self): utf8_data = u"Räksmörgås".encode("utf-8") - dammit = UnicodeDammit(utf8_data, ["iso-8859-1"]) + dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) self.assertEquals(dammit.originalEncoding, 'utf-8') def test_ignore_invalid_codecs(self): diff --git a/src/beautifulsoup/tests/test_strainer.py b/src/beautifulsoup/tests/test_strainer.py index 9a91463..f078935 100644 --- a/src/beautifulsoup/tests/test_strainer.py +++ b/src/beautifulsoup/tests/test_strainer.py @@ -1,7 +1,7 @@ import unittest -from helpers import SoupTest from beautifulsoup import BeautifulSoup from beautifulsoup.element import SoupStrainer +from beautifulsoup.testing import SoupTest class TestSoupStrainer(unittest.TestCase): diff --git a/src/beautifulsoup/tests/test_tree.py b/src/beautifulsoup/tests/test_tree.py index 42430d3..344a462 100644 --- a/src/beautifulsoup/tests/test_tree.py +++ b/src/beautifulsoup/tests/test_tree.py @@ -12,7 +12,7 @@ methods tested here. import re from beautifulsoup import BeautifulSoup from beautifulsoup.element import SoupStrainer, Tag -from helpers import SoupTest +from beautifulsoup.testing import SoupTest class TreeTest(SoupTest): |