Issue a warning if the BeautifulSoup constructor arguments do not explicitly name a parser.

author: Leonard Richardson <leonardr@segfault.org> 2014-12-07 09:31:30 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2014-12-07 09:31:30 -0500
commit: bf58c02abf418556927363cf79cc86bee58d0592 (patch)
tree: 747e9f5a6d6aa7fcce064ad44c9efb5e43fdca73
parent: a7f63d509473e11a48ff3f9b2d8b37a19a7a25ef (diff)
10 files changed, 63 insertions, 19 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index a0049ad..34a72e4 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -77,6 +77,8 @@ class BeautifulSoup(Tag):
 
     ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
 
+    NO_PARSER_SPECIFIED_WARNING = 'Parser was not explicitly specified. Using the best available parser for this system ("%s"). The same code on other systems may use a different parser and behave differently.'
+
     def __init__(self, markup="", features=None, builder=None,
                  parse_only=None, from_encoding=None, **kwargs):
         """The Soup object is initialized as the 'root tag', and the
@@ -114,9 +116,9 @@ class BeautifulSoup(Tag):
             del kwargs['isHTML']
             warnings.warn(
                 "BS4 does not respect the isHTML argument to the "
-                "BeautifulSoup constructor. You can pass in features='html' "
-                "or features='xml' to get a builder capable of handling "
-                "one or the other.")
+                "BeautifulSoup constructor. Suggest you use "
+                "features='lxml' for HTML and features='lxml-xml' for "
+                "XML.")
 
         def deprecated_argument(old_name, new_name):
             if old_name in kwargs:
@@ -140,6 +142,7 @@ class BeautifulSoup(Tag):
                 "__init__() got an unexpected keyword argument '%s'" % arg)
 
         if builder is None:
+            original_features = features
             if isinstance(features, basestring):
                 features = [features]
             if features is None or len(features) == 0:
@@ -151,6 +154,11 @@ class BeautifulSoup(Tag):
                     "requested: %s. Do you need to install a parser library?"
                     % ",".join(features))
             builder = builder_class()
+            if not (original_features == builder.NAME or
+                    (not isinstance(builder.NAME, basestring) and
+                     original_features in builder.NAME)):
+                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % builder.NAME)
+
         self.builder = builder
         self.is_xml = builder.is_xml
         self.builder.soup = self
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 740f5f2..0e84fae 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -80,6 +80,7 @@ builder_registry = TreeBuilderRegistry()
 class TreeBuilder(object):
     """Turn a document into a Beautiful Soup object tree."""
 
+    NAME = "[Unknown tree builder]"
     features = []
 
     is_xml = False
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 6446c2e..6013575 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -22,7 +22,9 @@ from bs4.element import (
 class HTML5TreeBuilder(HTMLTreeBuilder):
     """Use html5lib to build a tree."""
 
-    features = ['html5lib', PERMISSIVE, HTML_5, HTML]
+    NAME = "html5lib"
+
+    features = [NAME, PERMISSIVE, HTML_5, HTML]
 
     def prepare_markup(self, markup, user_specified_encoding):
         # Store the user-specified encoding for use later on.
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index ca8d8b8..3e78c65 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -128,7 +128,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
 class HTMLParserTreeBuilder(HTMLTreeBuilder):
 
     is_xml = False
-    features = [HTML, STRICT, HTMLPARSER]
+    NAME = HTMLPARSER
+    features = [NAME, HTML, STRICT]
 
     def __init__(self, *args, **kwargs):
         if CONSTRUCTOR_TAKES_STRICT:
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index fa5d498..110e9d2 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -25,8 +25,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
 
     is_xml = True
 
+    NAME = "lxml-xml"
+
     # Well, it's permissive by XML parser standards.
-    features = [LXML, XML, FAST, PERMISSIVE]
+    features = [NAME, LXML, XML, FAST, PERMISSIVE]
 
     CHUNK_SIZE = 512
 
@@ -212,7 +214,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
 
 class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
 
-    features = [LXML, HTML, FAST, PERMISSIVE]
+    NAME = [LXML, "lxml-html"]
+
+    features = NAME + [HTML, FAST, PERMISSIVE]
     is_xml = False
 
     def default_parser(self, encoding):
diff --git a/bs4/testing.py b/bs4/testing.py
index ce207cf..3e700f3 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -495,7 +495,7 @@ class XMLTreeBuilderSmokeTest(object):
   <script type="text/javascript">
   </script>
 """
-        soup = BeautifulSoup(doc, "xml")
+        soup = BeautifulSoup(doc, "lxml-xml")
         # lxml would have stripped this while parsing, but we can add
         # it later.
         soup.script.string = 'console.log("< < hey > > ");'
diff --git a/bs4/tests/test_builder_registry.py b/bs4/tests/test_builder_registry.py
index 92ad10f..90cad82 100644
--- a/bs4/tests/test_builder_registry.py
+++ b/bs4/tests/test_builder_registry.py
@@ -1,6 +1,7 @@
 """Tests of the builder registry."""
 
 import unittest
+import warnings
 
 from bs4 import BeautifulSoup
 from bs4.builder import (
@@ -67,10 +68,15 @@ class BuiltInRegistryTest(unittest.TestCase):
                           HTMLParserTreeBuilder)
 
     def test_beautifulsoup_constructor_does_lookup(self):
-        # You can pass in a string.
-        BeautifulSoup("", features="html")
-        # Or a list of strings.
-        BeautifulSoup("", features=["html", "fast"])
+
+        with warnings.catch_warnings(record=True) as w:
+            # This will create a warning about not explicitly
+            # specifying a parser, but we'll ignore it.
+
+            # You can pass in a string.
+            BeautifulSoup("", features="html")
+            # Or a list of strings.
+            BeautifulSoup("", features=["html", "fast"])
 
         # You'll get an exception if BS can't find an appropriate
         # builder.
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 47ac245..b74a246 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -49,7 +49,28 @@ class TestConstructor(SoupTest):
         self.assertEqual(u"foo\0bar", soup.h1.string)
 
 
-class TestDeprecatedConstructorArguments(SoupTest):
+class TestWarnings(SoupTest):
+
+    def _assert_no_parser_specified(self, s, is_there=True):
+        v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
+        self.assertTrue(v)
+
+    def test_warning_if_no_parser_specified(self):
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup("<a><b></b></a>")
+        msg = str(w[0].message)
+        self._assert_no_parser_specified(msg)
+
+    def test_warning_if_parser_specified_too_vague(self):
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup("<a><b></b></a>", "html")
+        msg = str(w[0].message)
+        self._assert_no_parser_specified(msg)
+
+    def test_no_warning_if_explicit_parser_specified(self):
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup("<a><b></b></a>", "html.parser")
+        self.assertEqual([], w)
 
     def test_parseOnlyThese_renamed_to_parse_only(self):
         with warnings.catch_warnings(record=True) as w:
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index f8515c0..de9543d 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -688,7 +688,7 @@ class TestTagCreation(SoupTest):
 
     def test_tag_inherits_self_closing_rules_from_builder(self):
         if XML_BUILDER_PRESENT:
-            xml_soup = BeautifulSoup("", "xml")
+            xml_soup = BeautifulSoup("", "lxml-xml")
             xml_br = xml_soup.new_tag("br")
             xml_p = xml_soup.new_tag("p")
 
@@ -697,7 +697,7 @@ class TestTagCreation(SoupTest):
             self.assertEqual(b"<br/>", xml_br.encode())
             self.assertEqual(b"<p/>", xml_p.encode())
 
-        html_soup = BeautifulSoup("", "html")
+        html_soup = BeautifulSoup("", "html.parser")
         html_br = html_soup.new_tag("br")
         html_p = html_soup.new_tag("p")
 
@@ -1366,7 +1366,7 @@ class TestSubstitutions(SoupTest):
    console.log("< < hey > > ");
   </script>
 """
-        encoded = BeautifulSoup(doc).encode()
+        encoded = BeautifulSoup(doc, 'html.parser').encode()
         self.assertTrue(b"< < hey > >" in encoded)
 
     def test_formatter_skips_style_tag_for_html_documents(self):
@@ -1375,7 +1375,7 @@ class TestSubstitutions(SoupTest):
    console.log("< < hey > > ");
   </style>
 """
-        encoded = BeautifulSoup(doc).encode()
+        encoded = BeautifulSoup(doc, 'html.parser').encode()
         self.assertTrue(b"< < hey > >" in encoded)
 
     def test_prettify_leaves_preformatted_text_alone(self):
@@ -1387,7 +1387,7 @@ class TestSubstitutions(SoupTest):
             soup.div.prettify())
 
     def test_prettify_accepts_formatter(self):
-        soup = BeautifulSoup("<html><body>foo</body></html>")
+        soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
         pretty = soup.prettify(formatter = lambda x: x.upper())
         self.assertTrue("FOO" in pretty)
 
@@ -1565,7 +1565,7 @@ class TestSoupSelector(TreeTest):
 """
 
     def setUp(self):
-        self.soup = BeautifulSoup(self.HTML)
+        self.soup = BeautifulSoup(self.HTML, 'html.parser')
 
     def assertSelects(self, selector, expected_ids):
         el_ids = [el['id'] for el in self.soup.select(selector)]
diff --git a/setup.py b/setup.py
index 0142ea0..75ba506 100644
--- a/setup.py
+++ b/setup.py
@@ -15,6 +15,7 @@ setup(name="beautifulsoup4",
       long_description="""Beautiful Soup sits atop an HTML or XML parser, providing Pythonic idioms for iterating, searching, and modifying the parse tree.""",
       license="MIT",
       packages=['bs4', 'bs4.builder', 'bs4.tests'],
+      install_requires=["lxml"],
       cmdclass = {'build_py':build_py},
       classifiers=["Development Status :: 4 - Beta",
                    "Intended Audience :: Developers",
author	Leonard Richardson <leonardr@segfault.org>	2014-12-07 09:31:30 -0500
committer	Leonard Richardson <leonardr@segfault.org>	2014-12-07 09:31:30 -0500
commit	bf58c02abf418556927363cf79cc86bee58d0592 (patch)
tree	747e9f5a6d6aa7fcce064ad44c9efb5e43fdca73
parent	a7f63d509473e11a48ff3f9b2d8b37a19a7a25ef (diff)