Fixed a Python 3 BytesWarning when a URL was passed in as though it
were markup. Thanks to James Salter for a patch and test. [bug=1533762]
author: Leonard Richardson <leonardr@segfault.org> 2016-07-16 11:59:37 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2016-07-16 11:59:37 -0400
commit: adcfa8e5ec199c41f5b22041dbfeb852aa034434 (patch)
tree: 1ff613481be6acb6df13474117cb9932cdf6c623
parent: e4ff05f0783605350171f6623d4055837c2af14f (diff)
3 files changed, 67 insertions, 18 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 3726c57..691cf1f 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,3 +1,11 @@
+= 4.5.0 () =
+
+* Corrected handling of XML processing instructions. [bug=1504393]
+
+* Fixed a Python 3 BytesWarning when a URL was passed in as though it
+  were markup. Thanks to James Salter for a patch and
+  test. [bug=1533762]
+
 = 4.4.1 (20150928) =
 
 * Fixed a bug that deranged the tree when part of it was
diff --git a/bs4/__init__.py b/bs4/__init__.py
index efcc457..688378a 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -198,16 +198,10 @@ class BeautifulSoup(Tag):
                 if isinstance(markup, unicode):
                     markup = markup.encode("utf8")
                 warnings.warn(
-                    '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
-            if markup[:5] == "http:" or markup[:6] == "https:":
-                # TODO: This is ugly but I couldn't get it to work in
-                # Python 3 otherwise.
-                if ((isinstance(markup, bytes) and not b' ' in markup)
-                    or (isinstance(markup, unicode) and not u' ' in markup)):
-                    if isinstance(markup, unicode):
-                        markup = markup.encode("utf8")
-                    warnings.warn(
-                        '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
+                    '"%s" looks like a filename, not markup. You should'
+                    ' probably open this file and pass the filehandle into'
+                    ' Beautiful Soup.' % markup)
+            self._check_markup_is_url(markup)
 
         for (self.markup, self.original_encoding, self.declared_html_encoding,
          self.contains_replacement_characters) in (
@@ -235,6 +229,34 @@ class BeautifulSoup(Tag):
             del d['builder']
         return d
 
+    @staticmethod
+    def _check_markup_is_url(markup):
+        """ 
+        Check if markup looks like it's actually a url and raise a warning 
+        if so. Markup can be unicode or str (py2) / bytes (py3).
+        """
+        if isinstance(markup, bytes):
+            space = b' '
+            cant_start_with = (b"http:", b"https:")
+        elif isinstance(markup, unicode):
+            space = u' '
+            cant_start_with = (u"http:", u"https:")
+        else:
+            return
+
+        if any(markup.startswith(prefix) for prefix in cant_start_with):
+            if not space in markup:
+                if isinstance(markup, bytes):
+                    decoded_markup = markup.decode('utf-8', 'replace')
+                else:
+                    decoded_markup = markup
+                warnings.warn(
+                    '"%s" looks like a URL. Beautiful Soup is not an'
+                    ' HTTP client. You should probably use an HTTP client like'
+                    ' requests to get the document behind the URL, and feed'
+                    ' that document to Beautiful Soup.' % decoded_markup
+                )
+
     def _feed(self):
         # Convert the document to Unicode.
         self.builder.reset()
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 1238af2..e1c2f3d 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -118,15 +118,34 @@ class TestWarnings(SoupTest):
             soup = self.soup(filename)
         self.assertEqual(0, len(w))
 
-    def test_url_warning(self):
-        with warnings.catch_warnings(record=True) as w:
-            soup = self.soup("http://www.crummy.com/")
-        msg = str(w[0].message)
-        self.assertTrue("looks like a URL" in msg)
+    def test_url_warning_with_bytes_url(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            soup = self.soup(b"http://www.crummybytes.com/")
+        # Be aware this isn't the only warning that can be raised during
+        # execution..
+        self.assertTrue(any("looks like a URL" in str(w.message) 
+            for w in warning_list))
+
+    def test_url_warning_with_unicode_url(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            # note - this url must differ from the bytes one otherwise
+            # python's warnings system swallows the second warning
+            soup = self.soup(u"http://www.crummyunicode.com/")
+        self.assertTrue(any("looks like a URL" in str(w.message) 
+            for w in warning_list))
+
+    def test_url_warning_with_bytes_and_space(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            soup = self.soup(b"http://www.crummybytes.com/ is great")
+        self.assertFalse(any("looks like a URL" in str(w.message) 
+            for w in warning_list))
+
+    def test_url_warning_with_unicode_and_space(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            soup = self.soup(u"http://www.crummyuncode.com/ is great")
+        self.assertFalse(any("looks like a URL" in str(w.message) 
+            for w in warning_list))
 
-        with warnings.catch_warnings(record=True) as w:
-            soup = self.soup("http://www.crummy.com/ is great")
-        self.assertEqual(0, len(w))
 
 class TestSelectiveParsing(SoupTest):
author	Leonard Richardson <leonardr@segfault.org>	2016-07-16 11:59:37 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2016-07-16 11:59:37 -0400
commit	adcfa8e5ec199c41f5b22041dbfeb852aa034434 (patch)
tree	1ff613481be6acb6df13474117cb9932cdf6c623
parent	e4ff05f0783605350171f6623d4055837c2af14f (diff)