diff options
author | Leonard Richardson <leonardr@segfault.org> | 2016-07-16 11:59:37 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2016-07-16 11:59:37 -0400 |
commit | adcfa8e5ec199c41f5b22041dbfeb852aa034434 (patch) | |
tree | 1ff613481be6acb6df13474117cb9932cdf6c623 /bs4/__init__.py | |
parent | e4ff05f0783605350171f6623d4055837c2af14f (diff) |
Fixed a Python 3 ByteWarning when a URL was passed in as though it
were markup. Thanks to James Salter for a patch and
test. [bug=1533762]
Diffstat (limited to 'bs4/__init__.py')
-rw-r--r-- | bs4/__init__.py | 42 |
1 files changed, 32 insertions, 10 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py index efcc457..688378a 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -198,16 +198,10 @@ class BeautifulSoup(Tag): if isinstance(markup, unicode): markup = markup.encode("utf8") warnings.warn( - '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup) - if markup[:5] == "http:" or markup[:6] == "https:": - # TODO: This is ugly but I couldn't get it to work in - # Python 3 otherwise. - if ((isinstance(markup, bytes) and not b' ' in markup) - or (isinstance(markup, unicode) and not u' ' in markup)): - if isinstance(markup, unicode): - markup = markup.encode("utf8") - warnings.warn( - '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) + '"%s" looks like a filename, not markup. You should' + 'probably open this file and pass the filehandle into' + 'Beautiful Soup.' % markup) + self._check_markup_is_url(markup) for (self.markup, self.original_encoding, self.declared_html_encoding, self.contains_replacement_characters) in ( @@ -235,6 +229,34 @@ class BeautifulSoup(Tag): del d['builder'] return d + @staticmethod + def _check_markup_is_url(markup): + """ + Check if markup looks like it's actually a url and raise a warning + if so. Markup can be unicode or str (py2) / bytes (py3). + """ + if isinstance(markup, bytes): + space = b' ' + cant_start_with = (b"http:", b"https:") + elif isinstance(markup, unicode): + space = u' ' + cant_start_with = (u"http:", u"https:") + else: + return + + if any(markup.startswith(prefix) for prefix in cant_start_with): + if not space in markup: + if isinstance(markup, bytes): + decoded_markup = markup.decode('utf-8', 'replace') + else: + decoded_markup = markup + warnings.warn( + '"%s" looks like a URL. Beautiful Soup is not an' + ' HTTP client. You should probably use an HTTP client like' + ' requests to get the document behind the URL, and feed' + ' that document to Beautiful Soup.' % decoded_markup + ) + def _feed(self): # Convert the document to Unicode. self.builder.reset() |