diff options
-rw-r--r-- | NEWS.txt | 6 |
-rw-r--r-- | TODO.txt | 29 |
-rw-r--r-- | bs4/builder/_html5lib.py | 3 |
-rw-r--r-- | bs4/tests/test_html5lib.py | 10 |
4 files changed, 23 insertions, 25 deletions
@@ -1,4 +1,4 @@ -= 4.0.0b8 () = += 4.0.0b8 (20110224) = * All tree builders now preserve namespace information in the documents they parse. @@ -10,6 +10,10 @@ * The string representation of a DOCTYPE always ends in a newline. +* Issue a warning if the user tries to use a SoupStrainer in + conjunction with the html5lib tree builder, which doesn't support + them. + = 4.0.0b7 (20110223) = * Upon decoding to string, any characters that can't be represented in @@ -1,29 +1,14 @@ -Bugs ----- - -* html5lib doesn't support SoupStrainers, which is OK, but there - should be a warning about it. - -Big features ------------- - -* Add namespace support. - Optimizations ------------- -markup_attr_map can be optimized since it's always a map now. - -BS3 features not yet ported ---------------------------- +The html5lib tree builder doesn't use the standard tree-building API, +which worries me. (This may also be why the tree builder doesn't +support SoupStrainers, but I think that has more to do with the fact +that the html5lib tree builder is constantly rearranging the tree, and +will crash if something it parsed earlier didn't actually make it into +the tree.) -* In BS3, "soup.aTag" is the same as 'soup.find("a")'. This lets you -locate a tag called (let's say) "find" with attribute -access. "soup.find" won't do what you want, but "soup.findTag" will. - -This still works In BS4 but it's deprecated. I could make -"soup.find_tag" work the same way as "soup.find('find')", but I don't -think it's worth it. +markup_attr_map can be optimized since it's always a map now. CDATA ----- diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 26b1773..cf716df 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -2,6 +2,7 @@ __all__ = [ 'HTML5TreeBuilder', ] +import warnings from bs4.builder import ( PERMISSIVE, HTML, @@ -30,6 +31,8 @@ class HTML5TreeBuilder(HTMLTreeBuilder): # These methods are defined by Beautiful Soup. 
def feed(self, markup): + if self.soup.parse_only is not None: + warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") parser = html5lib.HTMLParser(tree=self.create_treebuilder) doc = parser.parse(markup, encoding=self.user_specified_encoding) diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py index 0828cfd..f195f7d 100644 --- a/bs4/tests/test_html5lib.py +++ b/bs4/tests/test_html5lib.py @@ -1,5 +1,7 @@ """Tests to ensure that the html5lib tree builder generates good trees.""" +import warnings + try: from bs4.builder import HTML5TreeBuilder HTML5LIB_PRESENT = True @@ -26,11 +28,15 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): # The html5lib tree builder does not support SoupStrainers. strainer = SoupStrainer("b") markup = "<p>A <b>bold</b> statement.</p>" - soup = self.soup(markup, - parse_only=strainer) + with warnings.catch_warnings(record=True) as w: + soup = self.soup(markup, parse_only=strainer) self.assertEqual( soup.decode(), self.document_for(markup)) + self.assertTrue( + "the html5lib tree builder doesn't support parse_only" in + str(w[0].message)) + def test_correctly_nested_tables(self): """html5lib inserts <tbody> tags where other parsers don't.""" markup = ('<table id="1">' |