summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2013-06-03 07:34:30 -0400
committer Leonard Richardson <leonard.richardson@canonical.com> 2013-06-03 07:34:30 -0400
commit 9949d66d18f14623adf57fb4cef41b0282bb8672 (patch)
tree d2cfb8eb5fbe52b06b93d15e4645f8b607ccea35
parent 4a9444ac0b74fbf84cf86b9fcf6055c85e65f62a (diff)
Beautiful Soup will issue a warning if instead of markup you pass it
a URL or the name of a file on disk (a common beginner mistake).
-rw-r--r-- NEWS.txt 3
-rw-r--r-- bs4/__init__.py 19
-rw-r--r-- bs4/tests/test_soup.py 28
-rw-r--r-- setup.py 2
4 files changed, 49 insertions, 3 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 3d0846f..7b801c9 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -16,6 +16,9 @@
redundant code has been removed from Unicode, Dammit, and some
undocumented features have also been removed.
+* Beautiful Soup will issue a warning if instead of markup you pass it
+ a URL or the name of a file on disk (a common beginner mistake).
+
= 4.2.1 (20130531) =
* The default XML formatter will now replace ampersands even if they
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 7b5964a..0dded3a 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -17,12 +17,13 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.2.1"
+__version__ = "4.3.0"
__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
__license__ = "MIT"
__all__ = ['BeautifulSoup']
+import os
import re
import warnings
@@ -162,6 +163,22 @@ class BeautifulSoup(Tag):
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
+ elif len(markup) <= 256:
+ # Print out warnings for a couple beginner problems
+ # involving passing non-markup to Beautiful Soup.
+ # Beautiful Soup will still parse the input as markup,
+ # just in case that's what the user really wants.
+ if os.path.exists(markup):
+ warnings.warn(
+ '"%s" looks like a filename, not markup. You should probably open a filehandle and pass the filehandle into Beautiful Soup.' % markup)
+ if markup[:5] == "http:" or markup[:6] == "https:":
+ # TODO: This is ugly but I couldn't get it to work in
+ # Python 3 otherwise.
+ if ((isinstance(markup, bytes) and not b' ' in markup)
+ or (isinstance(markup, unicode) and not u' ' in markup)):
+ warnings.warn(
+ '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
+
for (self.markup, self.original_encoding, self.declared_html_encoding,
self.contains_replacement_characters) in (
self.builder.prepare_markup(markup, from_encoding)):
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 0b69318..860b17b 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -4,6 +4,8 @@
import logging
import unittest
import sys
+import tempfile
+
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
@@ -64,7 +66,31 @@ class TestDeprecatedConstructorArguments(SoupTest):
with warnings.catch_warnings(record=True) as w:
soup = BeautifulStoneSoup("<markup>")
self.assertTrue(isinstance(soup, BeautifulSoup))
- self.assertTrue("BeautifulStoneSoup class is deprecated")
+ self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
+
+class TestWarnings(SoupTest):
+
+ def test_disk_file_warning(self):
+ filehandle = tempfile.NamedTemporaryFile()
+ filename = filehandle.name
+ try:
+ with warnings.catch_warnings(record=True) as w:
+ soup = self.soup(filename)
+ msg = str(w[0].message)
+ self.assertTrue("looks like a filename" in msg)
+ finally:
+ filehandle.close()
+
+ def test_url_warning(self):
+ with warnings.catch_warnings(record=True) as w:
+ soup = self.soup("http://www.crummy.com/")
+ msg = str(w[0].message)
+ self.assertTrue("looks like a URL" in msg)
+
+ with warnings.catch_warnings(record=True) as w:
+ soup = self.soup("http://www.crummy.com/ is great")
+ self.assertEqual(0, len(w))
+
class TestSelectiveParsing(SoupTest):
diff --git a/setup.py b/setup.py
index 96457cd..3d3db4d 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@ except ImportError:
from distutils.command.build_py import build_py
setup(name="beautifulsoup4",
- version = "4.2.1",
+ version = "4.3.0",
author="Leonard Richardson",
author_email='leonardr@segfault.org',
url="http://www.crummy.com/software/BeautifulSoup/bs4/",