-rw-r--r--  NEWS.txt                 8
-rw-r--r--  bs4/__init__.py         42
-rw-r--r--  bs4/tests/test_soup.py  35
3 files changed, 67 insertions, 18 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 3726c57..691cf1f 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,3 +1,11 @@
+= 4.5.0 () =
+
+* Corrected handling of XML processing instructions. [bug=1504393]
+
+* Fixed a Python 3 BytesWarning when a URL was passed in as though it
+  were markup. Thanks to James Salter for a patch and
+  test. [bug=1533762]
+
 = 4.4.1 (20150928) =
 
 * Fixed a bug that deranged the tree when part of it was
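For readers unfamiliar with bug 1533762: the warning fires when a bare URL string is handed to the BeautifulSoup constructor instead of the downloaded document. A minimal sketch of that behaviour, assuming a release containing this patch is installed; the example.com URL and the commented-out requests call are illustrative only and not part of the patch:

    import warnings
    from bs4 import BeautifulSoup

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # Passing a URL as markup does not fetch anything; Beautiful Soup
        # parses the string itself and emits a UserWarning about it.
        BeautifulSoup("http://example.com/", "html.parser")
    print(any("looks like a URL" in str(w.message) for w in caught))  # True

    # The supported pattern: fetch the document first, then parse it.
    # import requests                      # illustrative HTTP client
    # html = requests.get("http://example.com/").text
    # soup = BeautifulSoup(html, "html.parser")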
diff --git a/bs4/__init__.py b/bs4/__init__.py
index efcc457..688378a 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -198,16 +198,10 @@ class BeautifulSoup(Tag):
                 if isinstance(markup, unicode):
                     markup = markup.encode("utf8")
                 warnings.warn(
-                    '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
-        if markup[:5] == "http:" or markup[:6] == "https:":
-            # TODO: This is ugly but I couldn't get it to work in
-            # Python 3 otherwise.
-            if ((isinstance(markup, bytes) and not b' ' in markup)
-                or (isinstance(markup, unicode) and not u' ' in markup)):
-                if isinstance(markup, unicode):
-                    markup = markup.encode("utf8")
-                warnings.warn(
-                    '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
+                    '"%s" looks like a filename, not markup. You should'
+                    ' probably open this file and pass the filehandle into'
+                    ' Beautiful Soup.' % markup)
+            self._check_markup_is_url(markup)
 
         for (self.markup, self.original_encoding, self.declared_html_encoding,
          self.contains_replacement_characters) in (
@@ -235,6 +229,34 @@ class BeautifulSoup(Tag):
         del d['builder']
         return d
 
+    @staticmethod
+    def _check_markup_is_url(markup):
+        """
+        Check if markup looks like it's actually a URL and issue a warning
+        if so. Markup can be unicode or str (py2) / bytes (py3).
+        """
+        if isinstance(markup, bytes):
+            space = b' '
+            cant_start_with = (b"http:", b"https:")
+        elif isinstance(markup, unicode):
+            space = u' '
+            cant_start_with = (u"http:", u"https:")
+        else:
+            return
+
+        if any(markup.startswith(prefix) for prefix in cant_start_with):
+            if space not in markup:
+                if isinstance(markup, bytes):
+                    decoded_markup = markup.decode('utf-8', 'replace')
+                else:
+                    decoded_markup = markup
+                warnings.warn(
+                    '"%s" looks like a URL. Beautiful Soup is not an'
+                    ' HTTP client. You should probably use an HTTP client like'
+                    ' requests to get the document behind the URL, and feed'
+                    ' that document to Beautiful Soup.' % decoded_markup
+                )
+
     def _feed(self):
         # Convert the document to Unicode.
         self.builder.reset()
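The helper's bytes branch decodes the markup with 'replace' before interpolating it into the message, which is what avoids the Python 3 BytesWarning (and the b'...' repr) that the old inline check produced. Below is a standalone Python 3 sketch of the same prefix-plus-no-space heuristic; the function name looks_like_url and the sample values are illustrative, not part of the library:

    import warnings

    def looks_like_url(markup):
        # Same heuristic as _check_markup_is_url: an http(s)-prefixed value
        # containing no space is probably a URL rather than markup.
        if isinstance(markup, bytes):
            space, prefixes = b' ', (b"http:", b"https:")
        elif isinstance(markup, str):
            space, prefixes = u' ', (u"http:", u"https:")
        else:
            return False
        # startswith() accepts a tuple of prefixes, so no loop is needed.
        return markup.startswith(prefixes) and space not in markup

    for sample in (b"http://example.com/", u"https://example.com/",
                   u"http://example.com/ is great", u"<p>real markup</p>"):
        if looks_like_url(sample):
            # Decode bytes before interpolation so the message is a clean
            # str either way and no BytesWarning is triggered under -b.
            text = (sample.decode("utf-8", "replace")
                    if isinstance(sample, bytes) else sample)
            warnings.warn('"%s" looks like a URL, not markup.' % text)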
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 1238af2..e1c2f3d 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -118,15 +118,34 @@ class TestWarnings(SoupTest):
             soup = self.soup(filename)
         self.assertEqual(0, len(w))
 
-    def test_url_warning(self):
-        with warnings.catch_warnings(record=True) as w:
-            soup = self.soup("http://www.crummy.com/")
-        msg = str(w[0].message)
-        self.assertTrue("looks like a URL" in msg)
+    def test_url_warning_with_bytes_url(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            soup = self.soup(b"http://www.crummybytes.com/")
+        # Be aware this isn't the only warning that can be issued during
+        # execution.
+        self.assertTrue(any("looks like a URL" in str(w.message)
+            for w in warning_list))
+
+    def test_url_warning_with_unicode_url(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            # Note: this URL must differ from the bytes one, otherwise
+            # Python's warnings system swallows the second warning.
+            soup = self.soup(u"http://www.crummyunicode.com/")
+        self.assertTrue(any("looks like a URL" in str(w.message)
+            for w in warning_list))
+
+    def test_url_warning_with_bytes_and_space(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            soup = self.soup(b"http://www.crummybytes.com/ is great")
+        self.assertFalse(any("looks like a URL" in str(w.message)
+            for w in warning_list))
+
+    def test_url_warning_with_unicode_and_space(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            soup = self.soup(u"http://www.crummyunicode.com/ is great")
+        self.assertFalse(any("looks like a URL" in str(w.message)
+            for w in warning_list))
-
-        with warnings.catch_warnings(record=True) as w:
-            soup = self.soup("http://www.crummy.com/ is great")
-        self.assertEqual(0, len(w))
 
 
 class TestSelectiveParsing(SoupTest):
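The "must differ from the bytes one" comment in test_url_warning_with_unicode_url refers to the default warning filter's de-duplication: a warning with the same text, category, and source line is only delivered once per warning registry, so reusing the crummybytes.com URL would leave warning_list empty in the second test. A small self-contained sketch of that behaviour, independent of Beautiful Soup and assuming the interpreter's default warning filters (no -W flags or test-runner overrides):

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        # Same message text from the same line: the "default" filter action
        # de-duplicates it, so only the first call is recorded.
        for url in (u"http://www.example.com/", u"http://www.example.com/"):
            warnings.warn('"%s" looks like a URL.' % url)
    print(len(caught))  # 1

    with warnings.catch_warnings(record=True) as caught:
        # Different message text (different URLs) defeats the de-duplication.
        for url in (u"http://www.crummybytes.com/",
                    u"http://www.crummyunicode.com/"):
            warnings.warn('"%s" looks like a URL.' % url)
    print(len(caught))  # 2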