diff options
author | Leonard Richardson <leonardr@segfault.org> | 2016-07-16 11:59:37 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2016-07-16 11:59:37 -0400 |
commit | adcfa8e5ec199c41f5b22041dbfeb852aa034434 (patch) | |
tree | 1ff613481be6acb6df13474117cb9932cdf6c623 | |
parent | e4ff05f0783605350171f6623d4055837c2af14f (diff) |
Fixed a Python 3 BytesWarning when a URL was passed in as though it
were markup. Thanks to James Salter for a patch and
test. [bug=1533762]
-rw-r--r-- | NEWS.txt | 8 | ||||
-rw-r--r-- | bs4/__init__.py | 42 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 35 |
3 files changed, 67 insertions, 18 deletions
@@ -1,3 +1,11 @@ += 4.5.0 () = + +* Corrected handling of XML processing instructions. [bug=1504393] + +* Fixed a Python 3 BytesWarning when a URL was passed in as though it + were markup. Thanks to James Salter for a patch and + test. [bug=1533762] + = 4.4.1 (20150928) = * Fixed a bug that deranged the tree when part of it was diff --git a/bs4/__init__.py b/bs4/__init__.py index efcc457..688378a 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -198,16 +198,10 @@ class BeautifulSoup(Tag): if isinstance(markup, unicode): markup = markup.encode("utf8") warnings.warn( - '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup) - if markup[:5] == "http:" or markup[:6] == "https:": - # TODO: This is ugly but I couldn't get it to work in - # Python 3 otherwise. - if ((isinstance(markup, bytes) and not b' ' in markup) - or (isinstance(markup, unicode) and not u' ' in markup)): - if isinstance(markup, unicode): - markup = markup.encode("utf8") - warnings.warn( - '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) + '"%s" looks like a filename, not markup. You should' + 'probably open this file and pass the filehandle into' + 'Beautiful Soup.' % markup) + self._check_markup_is_url(markup) for (self.markup, self.original_encoding, self.declared_html_encoding, self.contains_replacement_characters) in ( @@ -235,6 +229,34 @@ class BeautifulSoup(Tag): del d['builder'] return d + @staticmethod + def _check_markup_is_url(markup): + """ + Check if markup looks like it's actually a url and raise a warning + if so. Markup can be unicode or str (py2) / bytes (py3). 
+ """ + if isinstance(markup, bytes): + space = b' ' + cant_start_with = (b"http:", b"https:") + elif isinstance(markup, unicode): + space = u' ' + cant_start_with = (u"http:", u"https:") + else: + return + + if any(markup.startswith(prefix) for prefix in cant_start_with): + if not space in markup: + if isinstance(markup, bytes): + decoded_markup = markup.decode('utf-8', 'replace') + else: + decoded_markup = markup + warnings.warn( + '"%s" looks like a URL. Beautiful Soup is not an' + ' HTTP client. You should probably use an HTTP client like' + ' requests to get the document behind the URL, and feed' + ' that document to Beautiful Soup.' % decoded_markup + ) + def _feed(self): # Convert the document to Unicode. self.builder.reset() diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index 1238af2..e1c2f3d 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -118,15 +118,34 @@ class TestWarnings(SoupTest): soup = self.soup(filename) self.assertEqual(0, len(w)) - def test_url_warning(self): - with warnings.catch_warnings(record=True) as w: - soup = self.soup("http://www.crummy.com/") - msg = str(w[0].message) - self.assertTrue("looks like a URL" in msg) + def test_url_warning_with_bytes_url(self): + with warnings.catch_warnings(record=True) as warning_list: + soup = self.soup(b"http://www.crummybytes.com/") + # Be aware this isn't the only warning that can be raised during + # execution.. 
+ self.assertTrue(any("looks like a URL" in str(w.message) + for w in warning_list)) + + def test_url_warning_with_unicode_url(self): + with warnings.catch_warnings(record=True) as warning_list: + # note - this url must differ from the bytes one otherwise + # python's warnings system swallows the second warning + soup = self.soup(u"http://www.crummyunicode.com/") + self.assertTrue(any("looks like a URL" in str(w.message) + for w in warning_list)) + + def test_url_warning_with_bytes_and_space(self): + with warnings.catch_warnings(record=True) as warning_list: + soup = self.soup(b"http://www.crummybytes.com/ is great") + self.assertFalse(any("looks like a URL" in str(w.message) + for w in warning_list)) + + def test_url_warning_with_unicode_and_space(self): + with warnings.catch_warnings(record=True) as warning_list: + soup = self.soup(u"http://www.crummyuncode.com/ is great") + self.assertFalse(any("looks like a URL" in str(w.message) + for w in warning_list)) - with warnings.catch_warnings(record=True) as w: - soup = self.soup("http://www.crummy.com/ is great") - self.assertEqual(0, len(w)) class TestSelectiveParsing(SoupTest): |