4 files changed, 75 insertions, 22 deletions
diff --git a/BeautifulSoup.py b/BeautifulSoup.py
index aceb6d3..c0f7482 100644
--- a/BeautifulSoup.py
+++ b/BeautifulSoup.py
@@ -1029,7 +1029,7 @@ class TreeBuilder(Entities):
         pass
 
 
-class XMLParserBuilder(HTMLParser, TreeBuilder):
+class HTMLParserXMLTreeBuilder(HTMLParser, TreeBuilder):
 
     """
     This class defines a basic tree builder based on Python's built-in
@@ -1066,7 +1066,7 @@ class XMLParserBuilder(HTMLParser, TreeBuilder):
         <! --Comment--> (Extraneous whitespace in declaration)
 
     You can pass in a custom list of (RE object, replace method)
-    tuples to get XMLParserBuilder to scrub your input the way you
+    tuples to get HTMLParserXMLTreeBuilder to scrub your input the way you
     want.
     """
     reset_nesting_tags = {}
@@ -1313,7 +1313,7 @@ class XMLParserBuilder(HTMLParser, TreeBuilder):
         return j
 
 
-class HTMLParserBuilder(XMLParserBuilder):
+class HTMLParserTreeBuilder(HTMLParserXMLTreeBuilder):
     """This builder knows the following facts about HTML:
 
     * Some tags have no closing tag and should be interpreted as being
@@ -1411,7 +1411,7 @@ class HTMLParserBuilder(XMLParserBuilder):
     def __init__(self, *args, **kwargs):
         if not kwargs.has_key('smartQuotesTo'):
             kwargs['smartQuotesTo'] = self.HTML_ENTITIES
-        XMLParserBuilder.__init__(self, *args, **kwargs)
+        HTMLParserXMLTreeBuilder.__init__(self, *args, **kwargs)
 
 
 class BeautifulStoneSoup(Tag):
@@ -1450,7 +1450,7 @@ class BeautifulStoneSoup(Tag):
     STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
 
     def _defaultBuilder(self):
-        return XMLParserBuilder()
+        return HTMLParserXMLTreeBuilder()
 
     def __init__(self, markup="", builder=None, parseOnlyThese=None,
                  fromEncoding=None):
@@ -1662,14 +1662,53 @@ class BeautifulStoneSoup(Tag):
 class BeautifulSoup(BeautifulStoneSoup):
     """A convenience class for parsing HTML without creating a builder."""
     def _defaultBuilder(self):
-        return HTMLParserBuilder()
+        return HTMLParserTreeBuilder()
 
-class ICantBelieveItsBeautifulSoup(BeautifulStoneSoup):
-    pass
 
 class StopParsing(Exception):
     pass
 
+
+class ICantBelieveItsValidHTMLBuilder(HTMLParserTreeBuilder):
+    # Inline tags this builder treats as nestable (each listed once).
+    i_cant_believe_theyre_nestable_inline_tags = \
+     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
+      'cite', 'code', 'dfn', 'kbd', 'samp', 'var', 'b']
+
+    i_cant_believe_theyre_nestable_block_tags = ['noscript']
+
+    nestable_tags = buildTagMap([], HTMLParserTreeBuilder.nestable_tags,
+                                i_cant_believe_theyre_nestable_block_tags,
+                                i_cant_believe_theyre_nestable_inline_tags)
+
+
+class ICantBelieveItsBeautifulSoup(BeautifulStoneSoup):
+    """The BeautifulSoup class is oriented towards skipping over
+    common HTML errors like unclosed tags. However, sometimes it makes
+    errors of its own. For instance, consider this fragment:
+
+     <b>Foo<b>Bar</b></b>
+
+    This is perfectly valid (if bizarre) HTML. However, the
+    BeautifulSoup class will implicitly close the first b tag when it
+    encounters the second 'b'. It will think the author wrote
+    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
+    there's no real-world reason to bold something that's already
+    bold. When it encounters '</b></b>' it will close two more 'b'
+    tags, for a grand total of three tags closed instead of two. This
+    can throw off the rest of your document structure. The same is
+    true of the other tags listed in ICantBelieveItsValidHTMLBuilder.
+
+    It's much more common for someone to forget to close a 'b' tag
+    than to actually use nested 'b' tags, and the BeautifulSoup class
+    handles the common case. This class handles the not-so-common
+    case: where you can't believe someone wrote what they did, but
+    it's valid HTML and BeautifulSoup screwed up by assuming it
+    wouldn't be."""
+    def _defaultBuilder(self):
+        return ICantBelieveItsValidHTMLBuilder()
+
+
 ######################################################
 #
 # Bonus library: Unicode, Dammit
diff --git a/BeautifulSoupTests.py b/BeautifulSoupTests.py
index dcb08ef..721020d 100644
--- a/BeautifulSoupTests.py
+++ b/BeautifulSoupTests.py
@@ -280,7 +280,7 @@ class WriteOnlyCode(SoupTest):
     def testNewTagCreation(self):
         "Makes sure tags don't step on each others' toes."
         soup = BeautifulSoup()
-        builder = HTMLParserBuilder()
+        builder = HTMLParserTreeBuilder()
         a = Tag(soup, builder, 'a')
         ol = Tag(soup, builder, 'ol')
         a['href'] = 'http://foo.com/'
@@ -325,7 +325,7 @@ class WriteOnlyCode(SoupTest):
 
         # Even more complex
         soup = BeautifulSoup("<a><b>Find</b><c>lady!</c><d></d></a>")
-        builder = HTMLParserBuilder()
+        builder = HTMLParserTreeBuilder()
         tag = Tag(soup, builder, 'magictag')
         tag.insert(0, "the")
         soup.a.insert(1, tag)
@@ -527,7 +527,7 @@ class CleanupOnAisleFour(SoupTest):
         self.assertEqual(soup.decode(),
                          '<p>test1<selfclosing>test2</selfclosing></p>')
 
-        builder = XMLParserBuilder(selfClosingTags='selfclosing')
+        builder = HTMLParserXMLTreeBuilder(selfClosingTags='selfclosing')
         soup = BeautifulSoup(text, builder)
         self.assertEqual(soup.decode(),
                          '<p>test1<selfclosing />test2</p>')
@@ -583,9 +583,9 @@ class CleanupOnAisleFour(SoupTest):
         htmlEnt = Entities.HTML_ENTITIES
         xhtmlEnt = Entities.XHTML_ENTITIES
 
-        xmlBuilder = XMLParserBuilder(convertEntities=xmlEnt)
-        htmlBuilder = XMLParserBuilder(convertEntities=htmlEnt)
-        xhtmlBuilder = XMLParserBuilder(convertEntities=xhtmlEnt)
+        xmlBuilder = HTMLParserXMLTreeBuilder(convertEntities=xmlEnt)
+        htmlBuilder = HTMLParserXMLTreeBuilder(convertEntities=htmlEnt)
+        xhtmlBuilder = HTMLParserXMLTreeBuilder(convertEntities=xhtmlEnt)
 
         soup = BeautifulStoneSoup(text, xmlBuilder)
         self.assertEquals(soup.decode(), "<<sacr&eacute; bleu!>>")
@@ -608,7 +608,7 @@ class CleanupOnAisleFour(SoupTest):
         self.assertEquals(soup.decode(), u"<\u2122'")
 
     def testNonBreakingSpaces(self):
-        builder = HTMLParserBuilder(
+        builder = HTMLParserTreeBuilder(
             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
         soup = BeautifulSoup("<a>&nbsp;&nbsp;</a>", builder)
         self.assertEquals(soup.decode(), u"<a>\xa0\xa0</a>")
@@ -631,7 +631,7 @@ class CleanupOnAisleFour(SoupTest):
         self.assertSoupEquals('<x t="x&#xf1;">', '<x t="x\xc3\xb1"></x>',
                               encoding='utf-8')
 
-        builder = HTMLParserBuilder(convertEntities=Entities.HTML_ENTITIES)
+        builder = HTMLParserTreeBuilder(convertEntities=Entities.HTML_ENTITIES)
         soup = BeautifulSoup('<x t="&gt;&trade;">', builder)
         self.assertEquals(soup.decode(), u'<x t="&gt;\u2122"></x>')
 
@@ -649,7 +649,7 @@ class CleanupOnAisleFour(SoupTest):
                           uri.replace("&eacute;", u"\xe9"))
 
     def testNakedAmpersands(self):
-        builder = XMLParserBuilder(convertEntities=Entities.HTML_ENTITIES)
+        builder = HTMLParserXMLTreeBuilder(convertEntities=Entities.HTML_ENTITIES)
         soup = BeautifulStoneSoup("AT&T ", builder)
         self.assertEquals(soup.decode(), 'AT&amp;T ')
 
@@ -811,7 +811,7 @@ class EncodeRed(SoupTest):
         soup = BeautifulSoup(smartQuotes)
         self.assertEquals(soup.decode(),
                           'Il a dit, &lsaquo;Sacr&eacute; bl&#101;u!&rsaquo;')
-        builder = HTMLParserBuilder(convertEntities="html")
+        builder = HTMLParserTreeBuilder(convertEntities="html")
         soup = BeautifulSoup(smartQuotes, builder)
         self.assertEquals(soup.encode('utf-8'),
                           'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba')
@@ -832,5 +832,19 @@ class Whitewash(SoupTest):
         self.assertSoupEquals("<p>   </p>", "<p> </p>")
 
 
+class AlternateBuilders(SoupTest):
+    """Test tree builders other than the default HTML builder."""
+
+    def testICantBelieveItsValidHTML(self):
+        builder = ICantBelieveItsValidHTMLBuilder()
+        markup = "<b>Foo<b>Bar</b></b>"
+        # Default builder: the second <b> implicitly closes the first.
+        soup = BeautifulSoup(markup)
+        self.assertEquals(soup.decode(), "<b>Foo</b><b>Bar</b>")
+        # Alternate builder: the nested <b> tags survive a round trip.
+        soup = BeautifulSoup(markup, builder=builder)
+        self.assertEquals(soup.decode(), markup)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/lxml_builder.py b/lxml_builder.py
index 77e2f98..9929918 100644
--- a/lxml_builder.py
+++ b/lxml_builder.py
@@ -1,7 +1,7 @@
 from lxml import etree
 from BeautifulSoup import TreeBuilder, Comment
 
-class LXMLBuilder(TreeBuilder):
+class LXMLTreeBuilder(TreeBuilder):
 
     def __init__(self, parser_class=etree.HTMLParser, self_closing_tags=[]):
         self.parser = parser_class(target=self)
diff --git a/lxml_test.py b/lxml_test.py
index 35880fb..2e25c06 100644
--- a/lxml_test.py
+++ b/lxml_test.py
@@ -1,13 +1,13 @@
 from BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
-from lxml_builder import LXMLBuilder
+from lxml_builder import LXMLTreeBuilder
 from lxml import etree
-builder = LXMLBuilder()
+builder = LXMLTreeBuilder(parser_class=etree.XMLParser)
 soup = BeautifulStoneSoup("<foo>bar</foo>", builder=builder)
 print soup.prettify()
 
 soup = BeautifulSoup("<foo>bar</foo>", builder=builder)
 print soup.prettify()
 
-builder = LXMLBuilder(parser_class=etree.HTMLParser, self_closing_tags=["br"])
+builder = LXMLTreeBuilder(parser_class=etree.HTMLParser, self_closing_tags=['br'])
 soup = BeautifulSoup("<html><head><title>test<body><h1>page<!--Comment--><script>foo<b>bar</script><br />title</h1>", builder=builder)
 print soup.prettify()