6 files changed, 47 insertions, 164 deletions
diff --git a/src/beautifulsoup/tests/helpers.py b/src/beautifulsoup/tests/helpers.py
deleted file mode 100644
index 20d087e..0000000
--- a/src/beautifulsoup/tests/helpers.py
+++ /dev/null
@@ -1,154 +0,0 @@
-"""Helper classes for tests."""
-
-import unittest
-from beautifulsoup import BeautifulSoup
-from beautifulsoup.element import Comment, SoupStrainer
-from beautifulsoup.builder.lxml_builder import LXMLTreeBuilder
-
-class SoupTest(unittest.TestCase):
-
-    def setUp(self):
-        # LXMLTreeBuilder won't handle bad markup, but that's fine,
-        # since all the parsing tests take place in parser-specific
-        # test suites that override default_builder.
-        self.default_builder = LXMLTreeBuilder()
-
-    def soup(self, markup, **kwargs):
-        """Build a Beautiful Soup object from markup."""
-        return BeautifulSoup(markup, builder=self.default_builder, **kwargs)
-
-    def document_for(self, markup):
-        """Turn an HTML fragment into a document.
-
-        The details depend on the builder.
-        """
-        return self.default_builder.test_fragment_to_document(markup)
-
-    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
-        builder = self.default_builder
-        obj = BeautifulSoup(to_parse, builder=builder)
-        if compare_parsed_to is None:
-            compare_parsed_to = to_parse
-
-        self.assertEquals(obj.decode(), self.document_for(compare_parsed_to))
-
-
-
-class BuilderSmokeTest(SoupTest):
-    """A generic smoke test for tree builders.
-
-    Subclasses of this test ensure that all of Beautiful Soup's tree
-    builders generate more or less the same trees. It's okay for trees
-    to differ, especially when given invalid markup--just override the
-    appropriate test method to demonstrate how one tree builder
-    differs from others.
-    """
-
-    def test_bare_string(self):
-        # A bare string is turned into some kind of HTML document or
-        # fragment recognizable as the original string.
-        self.assertSoupEquals("A bare string")
-
-    def test_mixed_case_tags(self):
-        # Mixed-case tags are folded to lowercase.
-        self.assertSoupEquals(
-            "<a><B><Cd><EFG></efg></CD></b></A>",
-            "<a><b><cd><efg></efg></cd></b></a>")
-
-    def test_self_closing(self):
-        # HTML's self-closing tags are recognized as such.
-        self.assertSoupEquals(
-            "<p>A <meta> tag</p>", "<p>A <meta /> tag</p>")
-
-        self.assertSoupEquals(
-            "<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>")
-
-    def test_comment(self):
-        # Comments are represented as Comment objects.
-        markup = "<p>foo<!--foobar-->baz</p>"
-        self.assertSoupEquals(markup)
-
-        soup = self.soup(markup)
-        comment = soup.find(text="foobar")
-        self.assertEquals(comment.__class__, Comment)
-
-    def test_nested_inline_elements(self):
-        # Inline tags can be nested indefinitely.
-        b_tag = "<b>Inside a B tag</b>"
-        self.assertSoupEquals(b_tag)
-
-        nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
-        self.assertSoupEquals(nested_b_tag)
-
-        double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
-        self.assertSoupEquals(nested_b_tag)
-
-    def test_nested_block_level_elements(self):
-        soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
-        blockquote = soup.blockquote
-        self.assertEqual(blockquote.p.b.string, 'Foo')
-        self.assertEqual(blockquote.b.string, 'Foo')
-
-    def test_collapsed_whitespace(self):
-        """In most tags, whitespace is collapsed."""
-        self.assertSoupEquals("<p>   </p>", "<p> </p>")
-
-    def test_preserved_whitespace_in_pre_and_textarea(self):
-        """In <pre> and <textarea> tags, whitespace is preserved."""
-        self.assertSoupEquals("<pre>   </pre>")
-        self.assertSoupEquals("<textarea> woo  </textarea>")
-
-    def test_single_quote_attribute_values_become_double_quotes(self):
-        self.assertSoupEquals("<foo attr='bar'></foo>",
-                              '<foo attr="bar"></foo>')
-
-    def test_attribute_values_with_nested_quotes_are_left_alone(self):
-        text = """<foo attr='bar "brawls" happen'>a</foo>"""
-        self.assertSoupEquals(text)
-
-    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
-        text = """<foo attr='bar "brawls" happen'>a</foo>"""
-        soup = self.soup(text)
-        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
-        self.assertSoupEquals(
-            soup.foo.decode(),
-            """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>""")
-
-    def test_ampersand_in_attribute_value_gets_quoted(self):
-        self.assertSoupEquals('<this is="really messed up & stuff"></this>',
-                              '<this is="really messed up &amp; stuff"></this>')
-
-
-class BuilderInvalidMarkupSmokeTest(SoupTest):
-    """Tests of invalid markup.
-
-    These are very likely to give different results for different tree
-    builders. It's not required that a tree builder handle invalid
-    markup at all.
-    """
-
-    def test_unclosed_block_level_elements(self):
-        # Unclosed block-level elements should be closed.
-        self.assertSoupEquals(
-            '<blockquote><p><b>Foo</blockquote><p>Bar',
-            '<blockquote><p><b>Foo</b></p></blockquote><p>Bar</p>')
-
-    def test_fake_self_closing_tag(self):
-        # If a self-closing tag presents as a normal tag, the 'open'
-        # tag is treated as an instance of the self-closing tag and
-        # the 'close' tag is ignored.
-        self.assertSoupEquals(
-            "<item><link>http://foo.com/</link></item>",
-            "<item><link />http://foo.com/</item>")
-
-    def test_boolean_attribute_with_no_value_gets_empty_value(self):
-        soup = self.soup("<table><td nowrap>foo</td></table>")
-        self.assertEquals(soup.table.td['nowrap'], '')
-
-    def test_incorrectly_nested_tables(self):
-        self.assertSoupEquals(
-            '<table><tr><table><tr id="nested">',
-            '<table><tr><table><tr id="nested"></tr></table></tr></table>')
-
-
-
diff --git a/src/beautifulsoup/tests/test_html5lib.py b/src/beautifulsoup/tests/test_html5lib.py
index 4ffd968..7164dac 100644
--- a/src/beautifulsoup/tests/test_html5lib.py
+++ b/src/beautifulsoup/tests/test_html5lib.py
@@ -1,5 +1,8 @@
-from helpers import BuilderInvalidMarkupSmokeTest, BuilderSmokeTest
 from beautifulsoup.builder.html5lib_builder import HTML5TreeBuilder
+from beautifulsoup.testing import (
+    BuilderInvalidMarkupSmokeTest,
+    BuilderSmokeTest,
+)
 
 
 class TestHTML5Builder(BuilderSmokeTest):
@@ -30,4 +33,11 @@ class TestHTML5BuilderInvalidMarkup(BuilderInvalidMarkupSmokeTest):
              '<table><tbody><tr id="nested"></tr></tbody></table>'))
 
 
+    def test_foo(self):  # NOTE(review): placeholder name; this test makes no assertions
+        isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
+        soup = self.soup(isolatin)
 
+        utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode())  # same markup, declared as utf-8
+        utf8 = utf8.replace("\xe9", "\xc3\xa9")  # swap the single byte \xe9 for the two bytes \xc3\xa9
+        # NOTE(review): utf8 is never compared with the soup; only debugging output follows.
+        print soup
diff --git a/src/beautifulsoup/tests/test_lxml.py b/src/beautifulsoup/tests/test_lxml.py
index cd22b6f..b53ee42 100644
--- a/src/beautifulsoup/tests/test_lxml.py
+++ b/src/beautifulsoup/tests/test_lxml.py
@@ -1,6 +1,9 @@
 """Tests to ensure that the lxml tree builder generates good trees."""
 
-from helpers import BuilderInvalidMarkupSmokeTest, BuilderSmokeTest
+from beautifulsoup.testing import (
+    BuilderInvalidMarkupSmokeTest,
+    BuilderSmokeTest,
+)
 
 class TestLXMLBuilder(BuilderSmokeTest):
     """See `BuilderSmokeTest`."""
@@ -10,6 +13,15 @@ class TestLXMLBuilder(BuilderSmokeTest):
         self.assertSoupEquals(
             "A bare string", "<p>A bare string</p>")
 
+    def test_foo(self):  # NOTE(review): placeholder name; this test makes no assertions
+        isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
+        soup = self.soup(isolatin)
+        # Build a second copy of the markup: charset declared as utf-8, \xe9 replaced by \xc3\xa9.
+        utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode())
+        utf8 = utf8.replace("\xe9", "\xc3\xa9")
+        # NOTE(review): utf8 is never compared with the soup; only debugging output follows.
+        print soup
+
 
 class TestLXMLBuilderInvalidMarkup(BuilderInvalidMarkupSmokeTest):
     """See `BuilderInvalidMarkupSmokeTest`."""
diff --git a/src/beautifulsoup/tests/test_soup.py b/src/beautifulsoup/tests/test_soup.py
index d95cba6..ec0394d 100644
--- a/src/beautifulsoup/tests/test_soup.py
+++ b/src/beautifulsoup/tests/test_soup.py
@@ -2,9 +2,9 @@
 """Tests of Beautiful Soup as a whole."""
 
 import unittest
-from helpers import SoupTest
 from beautifulsoup.element import SoupStrainer
 from beautifulsoup.dammit import UnicodeDammit
+from beautifulsoup.testing import SoupTest
 
 
 class TestEncodingConversion(SoupTest):
@@ -48,6 +48,15 @@ class TestEncodingConversion(SoupTest):
         soup_from_unicode = self.soup(self.unicode_data)
         self.assertEquals(soup_from_unicode.encode('utf-8'), self.utf8_data)
 
+    def test_hebrew(self):
+        # A real-world test to make sure we can convert ISO-8859-8 (a
+        # Hebrew encoding) to UTF-8.
+        iso_8859_8= '<HTML><HEAD><TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE></HEAD><BODY><H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\xed\xe5\xec\xf9</BODY></HTML>'
+        utf8 = '<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9</body></html>'
+        soup = self.soup(iso_8859_8, fromEncoding="iso-8859-8")
+        self.assertEquals(soup.originalEncoding, 'iso-8859-8')
+        self.assertEquals(soup.encode('utf-8'), utf8)
+
 
 class TestSelectiveParsing(SoupTest):
 
@@ -58,14 +67,20 @@ class TestSelectiveParsing(SoupTest):
         self.assertEquals(soup.encode(), "<b>Yes</b><b>Yes <c>Yes</c></b>")
 
 
-
 class TestUnicodeDammit(unittest.TestCase):
     """Standalone tests of Unicode, Dammit."""
 
-    def test_smart_quote_replacement(self):
-        markup = "<foo>\x92</foo>"
+    def test_smart_quotes_to_xml_entities(self):
+        markup = "<foo>\x91\x92\x93\x94</foo>"  # four smart-quote bytes: left/right single, left/right double
         dammit = UnicodeDammit(markup)
-        self.assertEquals(dammit.unicode, "<foo>&#x2019;</foo>")
+        self.assertEquals(
+            dammit.unicode, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
+
+    def test_smart_quotes_to_html_entities(self):
+        markup = "<foo>\x91\x92\x93\x94</foo>"
+        dammit = UnicodeDammit(markup, smartQuotesTo="html")  # expect named HTML entities rather than numeric ones
+        self.assertEquals(
+            dammit.unicode, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
 
     def test_detect_utf8(self):
         utf8 = "\xc3\xa9"
@@ -87,7 +102,7 @@ class TestUnicodeDammit(unittest.TestCase):
 
     def test_ignore_inappropriate_codecs(self):
         utf8_data = u"Räksmörgås".encode("utf-8")
-        dammit = UnicodeDammit(utf8_data, ["iso-8859-1"])
+        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
         self.assertEquals(dammit.originalEncoding, 'utf-8')
 
     def test_ignore_invalid_codecs(self):
diff --git a/src/beautifulsoup/tests/test_strainer.py b/src/beautifulsoup/tests/test_strainer.py
index 9a91463..f078935 100644
--- a/src/beautifulsoup/tests/test_strainer.py
+++ b/src/beautifulsoup/tests/test_strainer.py
@@ -1,7 +1,7 @@
 import unittest
-from helpers import SoupTest
 from beautifulsoup import BeautifulSoup
 from beautifulsoup.element import SoupStrainer
+from beautifulsoup.testing import SoupTest
 
 class TestSoupStrainer(unittest.TestCase):
 
diff --git a/src/beautifulsoup/tests/test_tree.py b/src/beautifulsoup/tests/test_tree.py
index 42430d3..344a462 100644
--- a/src/beautifulsoup/tests/test_tree.py
+++ b/src/beautifulsoup/tests/test_tree.py
@@ -12,7 +12,7 @@ methods tested here.
 import re
 from beautifulsoup import BeautifulSoup
 from beautifulsoup.element import SoupStrainer, Tag
-from helpers import SoupTest
+from beautifulsoup.testing import SoupTest
 
 class TreeTest(SoupTest):