summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2012-02-24 10:37:47 -0500
committer Leonard Richardson <leonard.richardson@canonical.com> 2012-02-24 10:37:47 -0500
commit 2966334d384946c16a104b6c9964a1999b23b838 (patch)
tree 8bb022bfada0c44920f23e536686e3893ee6de79
parent 97b54c4bdbee0f109c444b50d8102ae8d7abb7c4 (diff)
Warn when SoupStrainer is used with the html5lib tree builder.
-rw-r--r-- NEWS.txt 6
-rw-r--r-- TODO.txt 29
-rw-r--r-- bs4/builder/_html5lib.py 3
-rw-r--r-- bs4/tests/test_html5lib.py 10
4 files changed, 23 insertions, 25 deletions
diff --git a/NEWS.txt b/NEWS.txt
index c93541e..bcffd61 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,4 +1,4 @@
-= 4.0.0b8 () =
+= 4.0.0b8 (20110224) =
* All tree builders now preserve namespace information in the
documents they parse.
@@ -10,6 +10,10 @@
* The string representation of a DOCTYPE always ends in a newline.
+* Issue a warning if the user tries to use a SoupStrainer in
+ conjunction with the html5lib tree builder, which doesn't support
+ them.
+
= 4.0.0b7 (20110223) =
* Upon decoding to string, any characters that can't be represented in
diff --git a/TODO.txt b/TODO.txt
index 61f9aee..e57d799 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,29 +1,14 @@
-Bugs
-----
-
-* html5lib doesn't support SoupStrainers, which is OK, but there
- should be a warning about it.
-
-Big features
-------------
-
-* Add namespace support.
-
Optimizations
-------------
-markup_attr_map can be optimized since it's always a map now.
-
-BS3 features not yet ported
----------------------------
+The html5lib tree builder doesn't use the standard tree-building API,
+which worries me. (This may also be why the tree builder doesn't
+support SoupStrainers, but I think that has more to do with the fact
+that the html5lib tree builder is constantly rearranging the tree, and
+will crash if something it parsed earlier didn't actually make it into
+the tree.)
-* In BS3, "soup.aTag" is the same as 'soup.find("a")'. This lets you
-locate a tag called (let's say) "find" with attribute
-access. "soup.find" won't do what you want, but "soup.findTag" will.
-
-This still works In BS4 but it's deprecated. I could make
-"soup.find_tag" work the same way as "soup.find('find')", but I don't
-think it's worth it.
+markup_attr_map can be optimized since it's always a map now.
CDATA
-----
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 26b1773..cf716df 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -2,6 +2,7 @@ __all__ = [
'HTML5TreeBuilder',
]
+import warnings
from bs4.builder import (
PERMISSIVE,
HTML,
@@ -30,6 +31,8 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# These methods are defined by Beautiful Soup.
def feed(self, markup):
+ if self.soup.parse_only is not None:
+ warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
doc = parser.parse(markup, encoding=self.user_specified_encoding)
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 0828cfd..f195f7d 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -1,5 +1,7 @@
"""Tests to ensure that the html5lib tree builder generates good trees."""
+import warnings
+
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
@@ -26,11 +28,15 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
# The html5lib tree builder does not support SoupStrainers.
strainer = SoupStrainer("b")
markup = "<p>A <b>bold</b> statement.</p>"
- soup = self.soup(markup,
- parse_only=strainer)
+ with warnings.catch_warnings(record=True) as w:
+ soup = self.soup(markup, parse_only=strainer)
self.assertEqual(
soup.decode(), self.document_for(markup))
+ self.assertTrue(
+ "the html5lib tree builder doesn't support parse_only" in
+ str(w[0].message))
+
def test_correctly_nested_tables(self):
"""html5lib inserts <tbody> tags where other parsers don't."""
markup = ('<table id="1">'