summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2014-12-07 09:31:30 -0500
committer Leonard Richardson <leonardr@segfault.org> 2014-12-07 09:31:30 -0500
commit bf58c02abf418556927363cf79cc86bee58d0592 (patch)
tree 747e9f5a6d6aa7fcce064ad44c9efb5e43fdca73
parent a7f63d509473e11a48ff3f9b2d8b37a19a7a25ef (diff)
Issue a warning if the BeautifulSoup constructor arguments do not explicitly name a parser.
-rw-r--r-- bs4/__init__.py 14
-rw-r--r-- bs4/builder/__init__.py 1
-rw-r--r-- bs4/builder/_html5lib.py 4
-rw-r--r-- bs4/builder/_htmlparser.py 3
-rw-r--r-- bs4/builder/_lxml.py 8
-rw-r--r-- bs4/testing.py 2
-rw-r--r-- bs4/tests/test_builder_registry.py 14
-rw-r--r-- bs4/tests/test_soup.py 23
-rw-r--r-- bs4/tests/test_tree.py 12
-rw-r--r-- setup.py 1
10 files changed, 63 insertions, 19 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index a0049ad..34a72e4 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -77,6 +77,8 @@ class BeautifulSoup(Tag):
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
+ NO_PARSER_SPECIFIED_WARNING = 'Parser was not explicitly specified. Using the best available parser for this system ("%s"). The same code on other systems may use a different parser and behave differently.'
+
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, **kwargs):
"""The Soup object is initialized as the 'root tag', and the
@@ -114,9 +116,9 @@ class BeautifulSoup(Tag):
del kwargs['isHTML']
warnings.warn(
"BS4 does not respect the isHTML argument to the "
- "BeautifulSoup constructor. You can pass in features='html' "
- "or features='xml' to get a builder capable of handling "
- "one or the other.")
+ "BeautifulSoup constructor. Suggest you use "
+ "features='lxml' for HTML and features='lxml-xml' for "
+ "XML.")
def deprecated_argument(old_name, new_name):
if old_name in kwargs:
@@ -140,6 +142,7 @@ class BeautifulSoup(Tag):
"__init__() got an unexpected keyword argument '%s'" % arg)
if builder is None:
+ original_features = features
if isinstance(features, basestring):
features = [features]
if features is None or len(features) == 0:
@@ -151,6 +154,11 @@ class BeautifulSoup(Tag):
"requested: %s. Do you need to install a parser library?"
% ",".join(features))
builder = builder_class()
+ if not (original_features == builder.NAME or
+ (not isinstance(builder.NAME, basestring) and
+ original_features in builder.NAME)):
+ warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % builder.NAME)
+
self.builder = builder
self.is_xml = builder.is_xml
self.builder.soup = self
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 740f5f2..0e84fae 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -80,6 +80,7 @@ builder_registry = TreeBuilderRegistry()
class TreeBuilder(object):
"""Turn a document into a Beautiful Soup object tree."""
+ NAME = "[Unknown tree builder]"
features = []
is_xml = False
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 6446c2e..6013575 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -22,7 +22,9 @@ from bs4.element import (
class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree."""
- features = ['html5lib', PERMISSIVE, HTML_5, HTML]
+ NAME = "html5lib"
+
+ features = [NAME, PERMISSIVE, HTML_5, HTML]
def prepare_markup(self, markup, user_specified_encoding):
# Store the user-specified encoding for use later on.
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index ca8d8b8..3e78c65 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -128,7 +128,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
class HTMLParserTreeBuilder(HTMLTreeBuilder):
is_xml = False
- features = [HTML, STRICT, HTMLPARSER]
+ NAME = HTMLPARSER
+ features = [NAME, HTML, STRICT]
def __init__(self, *args, **kwargs):
if CONSTRUCTOR_TAKES_STRICT:
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index fa5d498..110e9d2 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -25,8 +25,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
is_xml = True
+ NAME = "lxml-xml"
+
# Well, it's permissive by XML parser standards.
- features = [LXML, XML, FAST, PERMISSIVE]
+ features = [NAME, LXML, XML, FAST, PERMISSIVE]
CHUNK_SIZE = 512
@@ -212,7 +214,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
- features = [LXML, HTML, FAST, PERMISSIVE]
+ NAME = [LXML, "lxml-html"]
+
+ features = NAME + [HTML, FAST, PERMISSIVE]
is_xml = False
def default_parser(self, encoding):
diff --git a/bs4/testing.py b/bs4/testing.py
index ce207cf..3e700f3 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -495,7 +495,7 @@ class XMLTreeBuilderSmokeTest(object):
<script type="text/javascript">
</script>
"""
- soup = BeautifulSoup(doc, "xml")
+ soup = BeautifulSoup(doc, "lxml-xml")
# lxml would have stripped this while parsing, but we can add
# it later.
soup.script.string = 'console.log("< < hey > > ");'
diff --git a/bs4/tests/test_builder_registry.py b/bs4/tests/test_builder_registry.py
index 92ad10f..90cad82 100644
--- a/bs4/tests/test_builder_registry.py
+++ b/bs4/tests/test_builder_registry.py
@@ -1,6 +1,7 @@
"""Tests of the builder registry."""
import unittest
+import warnings
from bs4 import BeautifulSoup
from bs4.builder import (
@@ -67,10 +68,15 @@ class BuiltInRegistryTest(unittest.TestCase):
HTMLParserTreeBuilder)
def test_beautifulsoup_constructor_does_lookup(self):
- # You can pass in a string.
- BeautifulSoup("", features="html")
- # Or a list of strings.
- BeautifulSoup("", features=["html", "fast"])
+
+ with warnings.catch_warnings(record=True) as w:
+ # This will create a warning about not explicitly
+ # specifying a parser, but we'll ignore it.
+
+ # You can pass in a string.
+ BeautifulSoup("", features="html")
+ # Or a list of strings.
+ BeautifulSoup("", features=["html", "fast"])
# You'll get an exception if BS can't find an appropriate
# builder.
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 47ac245..b74a246 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -49,7 +49,28 @@ class TestConstructor(SoupTest):
self.assertEqual(u"foo\0bar", soup.h1.string)
-class TestDeprecatedConstructorArguments(SoupTest):
+class TestWarnings(SoupTest):
+
+ def _no_parser_specified(self, s, is_there=True):
+ v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
+ self.assertTrue(v)
+
+ def test_warning_if_no_parser_specified(self):
+ with warnings.catch_warnings(record=True) as w:
+ soup = self.soup("<a><b></b></a>")
+ msg = str(w[0].message)
+ self._assert_no_parser_specified(msg)
+
+ def test_warning_if_parser_specified_too_vague(self):
+ with warnings.catch_warnings(record=True) as w:
+ soup = self.soup("<a><b></b></a>", "html")
+ msg = str(w[0].message)
+ self._assert_no_parser_specified(msg)
+
+ def test_no_warning_if_explicit_parser_specified(self):
+ with warnings.catch_warnings(record=True) as w:
+ soup = self.soup("<a><b></b></a>", "html.parser")
+ self.assertEquals([], w)
def test_parseOnlyThese_renamed_to_parse_only(self):
with warnings.catch_warnings(record=True) as w:
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index f8515c0..de9543d 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -688,7 +688,7 @@ class TestTagCreation(SoupTest):
def test_tag_inherits_self_closing_rules_from_builder(self):
if XML_BUILDER_PRESENT:
- xml_soup = BeautifulSoup("", "xml")
+ xml_soup = BeautifulSoup("", "lxml-xml")
xml_br = xml_soup.new_tag("br")
xml_p = xml_soup.new_tag("p")
@@ -697,7 +697,7 @@ class TestTagCreation(SoupTest):
self.assertEqual(b"<br/>", xml_br.encode())
self.assertEqual(b"<p/>", xml_p.encode())
- html_soup = BeautifulSoup("", "html")
+ html_soup = BeautifulSoup("", "html.parser")
html_br = html_soup.new_tag("br")
html_p = html_soup.new_tag("p")
@@ -1366,7 +1366,7 @@ class TestSubstitutions(SoupTest):
console.log("< < hey > > ");
</script>
"""
- encoded = BeautifulSoup(doc).encode()
+ encoded = BeautifulSoup(doc, 'html.parser').encode()
self.assertTrue(b"< < hey > >" in encoded)
def test_formatter_skips_style_tag_for_html_documents(self):
@@ -1375,7 +1375,7 @@ class TestSubstitutions(SoupTest):
console.log("< < hey > > ");
</style>
"""
- encoded = BeautifulSoup(doc).encode()
+ encoded = BeautifulSoup(doc, 'html.parser').encode()
self.assertTrue(b"< < hey > >" in encoded)
def test_prettify_leaves_preformatted_text_alone(self):
@@ -1387,7 +1387,7 @@ class TestSubstitutions(SoupTest):
soup.div.prettify())
def test_prettify_accepts_formatter(self):
- soup = BeautifulSoup("<html><body>foo</body></html>")
+ soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
pretty = soup.prettify(formatter = lambda x: x.upper())
self.assertTrue("FOO" in pretty)
@@ -1565,7 +1565,7 @@ class TestSoupSelector(TreeTest):
"""
def setUp(self):
- self.soup = BeautifulSoup(self.HTML)
+ self.soup = BeautifulSoup(self.HTML, 'html.parser')
def assertSelects(self, selector, expected_ids):
el_ids = [el['id'] for el in self.soup.select(selector)]
diff --git a/setup.py b/setup.py
index 0142ea0..75ba506 100644
--- a/setup.py
+++ b/setup.py
@@ -15,6 +15,7 @@ setup(name="beautifulsoup4",
long_description="""Beautiful Soup sits atop an HTML or XML parser, providing Pythonic idioms for iterating, searching, and modifying the parse tree.""",
license="MIT",
packages=['bs4', 'bs4.builder', 'bs4.tests'],
+ install_requires=["lxml"],
cmdclass = {'build_py':build_py},
classifiers=["Development Status :: 4 - Beta",
"Intended Audience :: Developers",