diff options
-rw-r--r-- | BeautifulSoup.py | 55 | ||||
-rw-r--r-- | BeautifulSoupTests.py | 34 | ||||
-rw-r--r-- | lxml_builder.py | 2 | ||||
-rw-r--r-- | lxml_test.py | 6 |
4 files changed, 75 insertions, 22 deletions
diff --git a/BeautifulSoup.py b/BeautifulSoup.py index aceb6d3..c0f7482 100644 --- a/BeautifulSoup.py +++ b/BeautifulSoup.py @@ -1029,7 +1029,7 @@ class TreeBuilder(Entities): pass -class XMLParserBuilder(HTMLParser, TreeBuilder): +class HTMLParserXMLTreeBuilder(HTMLParser, TreeBuilder): """ This class defines a basic tree builder based on Python's built-in @@ -1066,7 +1066,7 @@ class XMLParserBuilder(HTMLParser, TreeBuilder): <! --Comment--> (Extraneous whitespace in declaration) You can pass in a custom list of (RE object, replace method) - tuples to get XMLParserBuilder to scrub your input the way you + tuples to get HTMLParserXMLTreeBuilder to scrub your input the way you want. """ reset_nesting_tags = {} @@ -1313,7 +1313,7 @@ class XMLParserBuilder(HTMLParser, TreeBuilder): return j -class HTMLParserBuilder(XMLParserBuilder): +class HTMLParserTreeBuilder(HTMLParserXMLTreeBuilder): """This builder knows the following facts about HTML: * Some tags have no closing tag and should be interpreted as being @@ -1411,7 +1411,7 @@ class HTMLParserBuilder(XMLParserBuilder): def __init__(self, *args, **kwargs): if not kwargs.has_key('smartQuotesTo'): kwargs['smartQuotesTo'] = self.HTML_ENTITIES - XMLParserBuilder.__init__(self, *args, **kwargs) + HTMLParserXMLTreeBuilder.__init__(self, *args, **kwargs) class BeautifulStoneSoup(Tag): @@ -1450,7 +1450,7 @@ class BeautifulStoneSoup(Tag): STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } def _defaultBuilder(self): - return XMLParserBuilder() + return HTMLParserXMLTreeBuilder() def __init__(self, markup="", builder=None, parseOnlyThese=None, fromEncoding=None): @@ -1662,14 +1662,53 @@ class BeautifulStoneSoup(Tag): class BeautifulSoup(BeautifulStoneSoup): """A convenience class for parsing HTML without creating a builder.""" def _defaultBuilder(self): - return HTMLParserBuilder() + return HTMLParserTreeBuilder() -class ICantBelieveItsBeautifulSoup(BeautifulStoneSoup): - pass class 
StopParsing(Exception): pass + +class ICantBelieveItsValidHTMLBuilder(HTMLParserTreeBuilder): + i_cant_believe_theyre_nestable_inline_tags = \ + ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', + 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', + 'big'] + + i_cant_believe_theyre_nestable_block_tags = ['noscript'] + + nestable_tags = buildTagMap([], HTMLParserTreeBuilder.nestable_tags, + i_cant_believe_theyre_nestable_block_tags, + i_cant_believe_theyre_nestable_inline_tags) + + +class ICantBelieveItsBeautifulSoup(BeautifulStoneSoup): + """The BeautifulSoup class is oriented towards skipping over + common HTML errors like unclosed tags. However, sometimes it makes + errors of its own. For instance, consider this fragment: + + <b>Foo<b>Bar</b></b> + + This is perfectly valid (if bizarre) HTML. However, the + BeautifulSoup class will implicitly close the first b tag when it + encounters the second 'b'. It will think the author wrote + "<b>Foo<b>Bar", and didn't close the first 'b' tag, because + there's no real-world reason to bold something that's already + bold. When it encounters '</b></b>' it will close two more 'b' + tags, for a grand total of three tags closed instead of two. This + can throw off the rest of your document structure. The same is + true of a number of other tags, listed below. + + It's much more common for someone to forget to close a 'b' tag + than to actually use nested 'b' tags, and the BeautifulSoup class + handles the common case. 
This class handles the not-so-common + case: where you can't believe someone wrote what they did, but + it's valid HTML and BeautifulSoup screwed up by assuming it + wouldn't be.""" + def _defaultBuilder(self): + return ICantBelieveItsValidHTMLBuilder() + + ###################################################### # # Bonus library: Unicode, Dammit diff --git a/BeautifulSoupTests.py b/BeautifulSoupTests.py index dcb08ef..721020d 100644 --- a/BeautifulSoupTests.py +++ b/BeautifulSoupTests.py @@ -280,7 +280,7 @@ class WriteOnlyCode(SoupTest): def testNewTagCreation(self): "Makes sure tags don't step on each others' toes." soup = BeautifulSoup() - builder = HTMLParserBuilder() + builder = HTMLParserTreeBuilder() a = Tag(soup, builder, 'a') ol = Tag(soup, builder, 'ol') a['href'] = 'http://foo.com/' @@ -325,7 +325,7 @@ class WriteOnlyCode(SoupTest): # Even more complex soup = BeautifulSoup("<a><b>Find</b><c>lady!</c><d></d></a>") - builder = HTMLParserBuilder() + builder = HTMLParserTreeBuilder() tag = Tag(soup, builder, 'magictag') tag.insert(0, "the") soup.a.insert(1, tag) @@ -527,7 +527,7 @@ class CleanupOnAisleFour(SoupTest): self.assertEqual(soup.decode(), '<p>test1<selfclosing>test2</selfclosing></p>') - builder = XMLParserBuilder(selfClosingTags='selfclosing') + builder = HTMLParserXMLTreeBuilder(selfClosingTags='selfclosing') soup = BeautifulSoup(text, builder) self.assertEqual(soup.decode(), '<p>test1<selfclosing />test2</p>') @@ -583,9 +583,9 @@ class CleanupOnAisleFour(SoupTest): htmlEnt = Entities.HTML_ENTITIES xhtmlEnt = Entities.XHTML_ENTITIES - xmlBuilder = XMLParserBuilder(convertEntities=xmlEnt) - htmlBuilder = XMLParserBuilder(convertEntities=htmlEnt) - xhtmlBuilder = XMLParserBuilder(convertEntities=xhtmlEnt) + xmlBuilder = HTMLParserXMLTreeBuilder(convertEntities=xmlEnt) + htmlBuilder = HTMLParserXMLTreeBuilder(convertEntities=htmlEnt) + xhtmlBuilder = HTMLParserXMLTreeBuilder(convertEntities=xhtmlEnt) soup = BeautifulStoneSoup(text, xmlBuilder) 
self.assertEquals(soup.decode(), "<<sacré bleu!>>") @@ -608,7 +608,7 @@ class CleanupOnAisleFour(SoupTest): self.assertEquals(soup.decode(), u"<\u2122'") def testNonBreakingSpaces(self): - builder = HTMLParserBuilder( + builder = HTMLParserTreeBuilder( convertEntities=BeautifulStoneSoup.HTML_ENTITIES) soup = BeautifulSoup("<a> </a>", builder) self.assertEquals(soup.decode(), u"<a>\xa0\xa0</a>") @@ -631,7 +631,7 @@ class CleanupOnAisleFour(SoupTest): self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>', encoding='utf-8') - builder = HTMLParserBuilder(convertEntities=Entities.HTML_ENTITIES) + builder = HTMLParserTreeBuilder(convertEntities=Entities.HTML_ENTITIES) soup = BeautifulSoup('<x t=">™">', builder) self.assertEquals(soup.decode(), u'<x t=">\u2122"></x>') @@ -649,7 +649,7 @@ class CleanupOnAisleFour(SoupTest): uri.replace("é", u"\xe9")) def testNakedAmpersands(self): - builder = XMLParserBuilder(convertEntities=Entities.HTML_ENTITIES) + builder = HTMLParserXMLTreeBuilder(convertEntities=Entities.HTML_ENTITIES) soup = BeautifulStoneSoup("AT&T ", builder) self.assertEquals(soup.decode(), 'AT&T ') @@ -811,7 +811,7 @@ class EncodeRed(SoupTest): soup = BeautifulSoup(smartQuotes) self.assertEquals(soup.decode(), 'Il a dit, ‹Sacré bleu!›') - builder = HTMLParserBuilder(convertEntities="html") + builder = HTMLParserTreeBuilder(convertEntities="html") soup = BeautifulSoup(smartQuotes, builder) self.assertEquals(soup.encode('utf-8'), 'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba') @@ -832,5 +832,19 @@ class Whitewash(SoupTest): self.assertSoupEquals("<p> </p>", "<p> </p>") +class AlternateBuilders(SoupTest): + """Test alternate builders.""" + + def testICantBelieveItsValidHTML(self): + builder = ICantBelieveItsValidHTMLBuilder() + markup = "<b>Foo<b>Bar</b></b>" + + soup = BeautifulSoup(markup) + self.assertEquals(soup.decode(), "<b>Foo</b><b>Bar</b>") + + soup = BeautifulSoup(markup, builder=builder) + self.assertEquals(soup.decode(), markup) + + if 
__name__ == '__main__': unittest.main() diff --git a/lxml_builder.py b/lxml_builder.py index 77e2f98..9929918 100644 --- a/lxml_builder.py +++ b/lxml_builder.py @@ -1,7 +1,7 @@ from lxml import etree from BeautifulSoup import TreeBuilder, Comment -class LXMLBuilder(TreeBuilder): +class LXMLTreeBuilder(TreeBuilder): def __init__(self, parser_class=etree.HTMLParser, self_closing_tags=[]): self.parser = parser_class(target=self) diff --git a/lxml_test.py b/lxml_test.py index 35880fb..2e25c06 100644 --- a/lxml_test.py +++ b/lxml_test.py @@ -1,13 +1,13 @@ from BeautifulSoup import BeautifulStoneSoup, BeautifulSoup -from lxml_builder import LXMLBuilder +from lxml_builder import LXMLTreeBuilder from lxml import etree -builder = LXMLBuilder() +builder = LXMLTreeBuilder(parser_class=etree.XMLParser) soup = BeautifulStoneSoup("<foo>bar</foo>", builder=builder) print soup.prettify() soup = BeautifulSoup("<foo>bar</foo>", builder=builder) print soup.prettify() -builder = LXMLBuilder(parser_class=etree.HTMLParser, self_closing_tags=["br"]) +builder = LXMLTreeBuilder(parser_class=etree.HTMLParser, self_closing_tags=['br']) soup = BeautifulSoup("<html><head><title>test<body><h1>page<!--Comment--><script>foo<b>bar</script><br />title</h1>", builder=builder) print soup.prettify() |