diff options
-rw-r--r-- | bs4/__init__.py | 14 | ||||
-rw-r--r-- | bs4/builder/__init__.py | 1 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 4 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 3 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 8 | ||||
-rw-r--r-- | bs4/testing.py | 2 | ||||
-rw-r--r-- | bs4/tests/test_builder_registry.py | 14 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 23 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 12 | ||||
-rw-r--r-- | setup.py | 1 |
10 files changed, 63 insertions, 19 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py index a0049ad..34a72e4 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -77,6 +77,8 @@ class BeautifulSoup(Tag): ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' + NO_PARSER_SPECIFIED_WARNING = 'Parser was not explicitly specified. Using the best available parser for this system ("%s"). The same code on other systems may use a different parser and behave differently.' + def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, **kwargs): """The Soup object is initialized as the 'root tag', and the @@ -114,9 +116,9 @@ class BeautifulSoup(Tag): del kwargs['isHTML'] warnings.warn( "BS4 does not respect the isHTML argument to the " - "BeautifulSoup constructor. You can pass in features='html' " - "or features='xml' to get a builder capable of handling " - "one or the other.") + "BeautifulSoup constructor. Suggest you use " + "features='lxml' for HTML and features='lxml-xml' for " + "XML.") def deprecated_argument(old_name, new_name): if old_name in kwargs: @@ -140,6 +142,7 @@ class BeautifulSoup(Tag): "__init__() got an unexpected keyword argument '%s'" % arg) if builder is None: + original_features = features if isinstance(features, basestring): features = [features] if features is None or len(features) == 0: @@ -151,6 +154,11 @@ class BeautifulSoup(Tag): "requested: %s. Do you need to install a parser library?" % ",".join(features)) builder = builder_class() + if not (original_features == builder.NAME or + (not isinstance(builder.NAME, basestring) and + original_features in builder.NAME)): + warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % builder.NAME) + self.builder = builder self.is_xml = builder.is_xml self.builder.soup = self diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index 740f5f2..0e84fae 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -80,6 +80,7 @@ builder_registry = TreeBuilderRegistry() class TreeBuilder(object): """Turn a document into a Beautiful Soup object tree.""" + NAME = "[Unknown tree builder]" features = [] is_xml = False diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 6446c2e..6013575 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -22,7 +22,9 @@ from bs4.element import ( class HTML5TreeBuilder(HTMLTreeBuilder): """Use html5lib to build a tree.""" - features = ['html5lib', PERMISSIVE, HTML_5, HTML] + NAME = "html5lib" + + features = [NAME, PERMISSIVE, HTML_5, HTML] def prepare_markup(self, markup, user_specified_encoding): # Store the user-specified encoding for use later on. diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index ca8d8b8..3e78c65 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -128,7 +128,8 @@ class BeautifulSoupHTMLParser(HTMLParser): class HTMLParserTreeBuilder(HTMLTreeBuilder): is_xml = False - features = [HTML, STRICT, HTMLPARSER] + NAME = HTMLPARSER + features = [NAME, HTML, STRICT] def __init__(self, *args, **kwargs): if CONSTRUCTOR_TAKES_STRICT: diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index fa5d498..110e9d2 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -25,8 +25,10 @@ class LXMLTreeBuilderForXML(TreeBuilder): is_xml = True + NAME = "lxml-xml" + # Well, it's permissive by XML parser standards. - features = [LXML, XML, FAST, PERMISSIVE] + features = [NAME, LXML, XML, FAST, PERMISSIVE] CHUNK_SIZE = 512 @@ -212,7 +214,9 @@ class LXMLTreeBuilderForXML(TreeBuilder): class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): - features = [LXML, HTML, FAST, PERMISSIVE] + NAME = [LXML, "lxml-html"] + + features = NAME + [HTML, FAST, PERMISSIVE] is_xml = False def default_parser(self, encoding): diff --git a/bs4/testing.py b/bs4/testing.py index ce207cf..3e700f3 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -495,7 +495,7 @@ class XMLTreeBuilderSmokeTest(object): <script type="text/javascript"> </script> """ - soup = BeautifulSoup(doc, "xml") + soup = BeautifulSoup(doc, "lxml-xml") # lxml would have stripped this while parsing, but we can add # it later. soup.script.string = 'console.log("< < hey > > ");' diff --git a/bs4/tests/test_builder_registry.py b/bs4/tests/test_builder_registry.py index 92ad10f..90cad82 100644 --- a/bs4/tests/test_builder_registry.py +++ b/bs4/tests/test_builder_registry.py @@ -1,6 +1,7 @@ """Tests of the builder registry.""" import unittest +import warnings from bs4 import BeautifulSoup from bs4.builder import ( @@ -67,10 +68,15 @@ class BuiltInRegistryTest(unittest.TestCase): HTMLParserTreeBuilder) def test_beautifulsoup_constructor_does_lookup(self): - # You can pass in a string. - BeautifulSoup("", features="html") - # Or a list of strings. - BeautifulSoup("", features=["html", "fast"]) + + with warnings.catch_warnings(record=True) as w: + # This will create a warning about not explicitly + # specifying a parser, but we'll ignore it. + + # You can pass in a string. + BeautifulSoup("", features="html") + # Or a list of strings. + BeautifulSoup("", features=["html", "fast"]) # You'll get an exception if BS can't find an appropriate # builder. diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index 47ac245..b74a246 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -49,7 +49,28 @@ class TestConstructor(SoupTest): self.assertEqual(u"foo\0bar", soup.h1.string) -class TestDeprecatedConstructorArguments(SoupTest): +class TestWarnings(SoupTest): + + def _no_parser_specified(self, s, is_there=True): + v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80]) + self.assertTrue(v) + + def test_warning_if_no_parser_specified(self): + with warnings.catch_warnings(record=True) as w: + soup = self.soup("<a><b></b></a>") + msg = str(w[0].message) + self._assert_no_parser_specified(msg) + + def test_warning_if_parser_specified_too_vague(self): + with warnings.catch_warnings(record=True) as w: + soup = self.soup("<a><b></b></a>", "html") + msg = str(w[0].message) + self._assert_no_parser_specified(msg) + + def test_no_warning_if_explicit_parser_specified(self): + with warnings.catch_warnings(record=True) as w: + soup = self.soup("<a><b></b></a>", "html.parser") + self.assertEquals([], w) def test_parseOnlyThese_renamed_to_parse_only(self): with warnings.catch_warnings(record=True) as w: diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index f8515c0..de9543d 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -688,7 +688,7 @@ class TestTagCreation(SoupTest): def test_tag_inherits_self_closing_rules_from_builder(self): if XML_BUILDER_PRESENT: - xml_soup = BeautifulSoup("", "xml") + xml_soup = BeautifulSoup("", "lxml-xml") xml_br = xml_soup.new_tag("br") xml_p = xml_soup.new_tag("p") @@ -697,7 +697,7 @@ class TestTagCreation(SoupTest): self.assertEqual(b"<br/>", xml_br.encode()) self.assertEqual(b"<p/>", xml_p.encode()) - html_soup = BeautifulSoup("", "html") + html_soup = BeautifulSoup("", "html.parser") html_br = html_soup.new_tag("br") html_p = html_soup.new_tag("p") @@ -1366,7 +1366,7 @@ class TestSubstitutions(SoupTest): console.log("< < hey > > "); </script> """ - encoded = BeautifulSoup(doc).encode() + encoded = BeautifulSoup(doc, 'html.parser').encode() self.assertTrue(b"< < hey > >" in encoded) def test_formatter_skips_style_tag_for_html_documents(self): @@ -1375,7 +1375,7 @@ class TestSubstitutions(SoupTest): console.log("< < hey > > "); </style> """ - encoded = BeautifulSoup(doc).encode() + encoded = BeautifulSoup(doc, 'html.parser').encode() self.assertTrue(b"< < hey > >" in encoded) def test_prettify_leaves_preformatted_text_alone(self): @@ -1387,7 +1387,7 @@ class TestSubstitutions(SoupTest): soup.div.prettify()) def test_prettify_accepts_formatter(self): - soup = BeautifulSoup("<html><body>foo</body></html>") + soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser') pretty = soup.prettify(formatter = lambda x: x.upper()) self.assertTrue("FOO" in pretty) @@ -1565,7 +1565,7 @@ class TestSoupSelector(TreeTest): """ def setUp(self): - self.soup = BeautifulSoup(self.HTML) + self.soup = BeautifulSoup(self.HTML, 'html.parser') def assertSelects(self, selector, expected_ids): el_ids = [el['id'] for el in self.soup.select(selector)] @@ -15,6 +15,7 @@ setup(name="beautifulsoup4", long_description="""Beautiful Soup sits atop an HTML or XML parser, providing Pythonic idioms for iterating, searching, and modifying the parse tree.""", license="MIT", packages=['bs4', 'bs4.builder', 'bs4.tests'], + install_requires=["lxml"], cmdclass = {'build_py':build_py}, classifiers=["Development Status :: 4 - Beta", "Intended Audience :: Developers", |