diff options
-rw-r--r-- | NEWS.txt | 3 | ||||
-rw-r--r-- | bs4/__init__.py | 19 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 28 | ||||
-rw-r--r-- | setup.py | 2 |
4 files changed, 49 insertions, 3 deletions
@@ -16,6 +16,9 @@ redundant code has been removed from Unicode, Dammit, and some undocumented features have also been removed. +* Beautiful Soup will issue a warning if instead of markup you pass it + a URL or the name of a file on disk (a common beginner mistake). + = 4.2.1 (20130531) = * The default XML formatter will now replace ampersands even if they diff --git a/bs4/__init__.py b/bs4/__init__.py index 7b5964a..0dded3a 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -17,12 +17,13 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.2.1" +__version__ = "4.3.0" __copyright__ = "Copyright (c) 2004-2013 Leonard Richardson" __license__ = "MIT" __all__ = ['BeautifulSoup'] +import os import re import warnings @@ -162,6 +163,22 @@ class BeautifulSoup(Tag): if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() + elif len(markup) <= 256: + # Print out warnings for a couple beginner problems + # involving passing non-markup to Beautiful Soup. + # Beautiful Soup will still parse the input as markup, + # just in case that's what the user really wants. + if os.path.exists(markup): + warnings.warn( + '"%s" looks like a filename, not markup. You should probably open a filehandle and pass the filehandle into Beautiful Soup.' % markup) + if markup[:5] == "http:" or markup[:6] == "https:": + # TODO: This is ugly but I couldn't get it to work in + # Python 3 otherwise. + if ((isinstance(markup, bytes) and not b' ' in markup) + or (isinstance(markup, unicode) and not u' ' in markup)): + warnings.warn( + '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) + for (self.markup, self.original_encoding, self.declared_html_encoding, self.contains_replacement_characters) in ( self.builder.prepare_markup(markup, from_encoding)): diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index 0b69318..860b17b 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -4,6 +4,8 @@ import logging import unittest import sys +import tempfile + from bs4 import ( BeautifulSoup, BeautifulStoneSoup, @@ -64,7 +66,31 @@ class TestDeprecatedConstructorArguments(SoupTest): with warnings.catch_warnings(record=True) as w: soup = BeautifulStoneSoup("<markup>") self.assertTrue(isinstance(soup, BeautifulSoup)) - self.assertTrue("BeautifulStoneSoup class is deprecated") + self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) + +class TestWarnings(SoupTest): + + def test_disk_file_warning(self): + filehandle = tempfile.NamedTemporaryFile() + filename = filehandle.name + try: + with warnings.catch_warnings(record=True) as w: + soup = self.soup(filename) + msg = str(w[0].message) + self.assertTrue("looks like a filename" in msg) + finally: + filehandle.close() + + def test_url_warning(self): + with warnings.catch_warnings(record=True) as w: + soup = self.soup("http://www.crummy.com/") + msg = str(w[0].message) + self.assertTrue("looks like a URL" in msg) + + with warnings.catch_warnings(record=True) as w: + soup = self.soup("http://www.crummy.com/ is great") + self.assertEqual(0, len(w)) + class TestSelectiveParsing(SoupTest): @@ -7,7 +7,7 @@ except ImportError: from distutils.command.build_py import build_py setup(name="beautifulsoup4", - version = "4.2.1", + version = "4.3.0", author="Leonard Richardson", author_email='leonardr@segfault.org', url="http://www.crummy.com/software/BeautifulSoup/bs4/", |