diff options
author | Leonard Richardson <leonardr@segfault.org> | 2021-09-07 20:09:32 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2021-09-07 20:09:32 -0400 |
commit | 9d68e443978afda17f59f0ff9e73af2b9b0921c2 (patch) | |
tree | c23b00ad1379e3c10212c048ef84fc40c9321da3 /bs4/__init__.py | |
parent | 70f546b1e689a70e2f103795efce6d261a3dadf7 (diff) |
Goodbye, Python 2. [bug=1942919]
Diffstat (limited to 'bs4/__init__.py')
-rw-r--r-- | bs4/__init__.py | 46 |
1 file changed, 24 insertions, 22 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py index 7c6044a..2a436d3 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -7,7 +7,7 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a provides methods and Pythonic idioms that make it easy to navigate, search, and modify the parse tree. -Beautiful Soup works with Python 2.7 and up. It works better if lxml +Beautiful Soup works with Python 3.5 and up. It works better if lxml and/or html5lib is installed. For more than you ever wanted to know about Beautiful Soup, see the @@ -15,13 +15,14 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.9.3" -__copyright__ = "Copyright (c) 2004-2020 Leonard Richardson" +__version__ = "4.10.0" +__copyright__ = "Copyright (c) 2004-2021 Leonard Richardson" # Use of this source code is governed by the MIT license. __license__ = "MIT" __all__ = ['BeautifulSoup'] + from collections import Counter import os import re @@ -29,6 +30,11 @@ import sys import traceback import warnings +# The very first thing we do is give a useful error if someone is +# running this code under Python 2. +if sys.version_info.major < 3: + raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.') + from .builder import builder_registry, ParserRejectedMarkup from .dammit import UnicodeDammit from .element import ( @@ -49,10 +55,6 @@ from .element import ( TemplateString, ) -# The very first thing we do is give a useful error if someone is -# running this code under Python 3 without converting it. -'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' - # Define some custom warnings. 
class GuessedAtParserWarning(UserWarning): """The warning issued when BeautifulSoup has to guess what parser to @@ -100,7 +102,7 @@ class BeautifulSoup(Tag): # Since BeautifulSoup subclasses Tag, it's possible to treat it as # a Tag with a .name. This name makes it clear the BeautifulSoup # object isn't a real markup tag. - ROOT_TAG_NAME = u'[document]' + ROOT_TAG_NAME = '[document]' # If the end-user gives no indication which tree builder they # want, look for one with these features. @@ -217,7 +219,7 @@ class BeautifulSoup(Tag): from_encoding = from_encoding or deprecated_argument( "fromEncoding", "from_encoding") - if from_encoding and isinstance(markup, unicode): + if from_encoding and isinstance(markup, str): warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") from_encoding = None @@ -234,7 +236,7 @@ class BeautifulSoup(Tag): builder_class = builder builder = None elif builder is None: - if isinstance(features, basestring): + if isinstance(features, str): features = [features] if features is None or len(features) == 0: features = self.DEFAULT_BUILDER_FEATURES @@ -309,13 +311,13 @@ class BeautifulSoup(Tag): markup = markup.read() elif len(markup) <= 256 and ( (isinstance(markup, bytes) and not b'<' in markup) - or (isinstance(markup, unicode) and not u'<' in markup) + or (isinstance(markup, str) and not '<' in markup) ): # Print out warnings for a couple beginner problems # involving passing non-markup to Beautiful Soup. # Beautiful Soup will still parse the input as markup, # just in case that's what the user really wants. 
- if (isinstance(markup, unicode) + if (isinstance(markup, str) and not os.path.supports_unicode_filenames): possible_filename = markup.encode("utf8") else: @@ -326,7 +328,7 @@ class BeautifulSoup(Tag): is_file = os.path.exists(possible_filename) if is_file: is_directory = os.path.isdir(possible_filename) - except Exception, e: + except Exception as e: # This is almost certainly a problem involving # characters not valid in filenames on this # system. Just let it go. @@ -365,9 +367,9 @@ class BeautifulSoup(Tag): pass if not success: - other_exceptions = [unicode(e) for e in rejections] + other_exceptions = [str(e) for e in rejections] raise ParserRejectedMarkup( - u"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions) + "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions) ) # Clear out the markup and remove the builder's circular @@ -418,9 +420,9 @@ class BeautifulSoup(Tag): if isinstance(markup, bytes): space = b' ' cant_start_with = (b"http:", b"https:") - elif isinstance(markup, unicode): - space = u' ' - cant_start_with = (u"http:", u"https:") + elif isinstance(markup, str): + space = ' ' + cant_start_with = ("http:", "https:") else: return @@ -555,7 +557,7 @@ class BeautifulSoup(Tag): occurs. """ if self.current_data: - current_data = u''.join(self.current_data) + current_data = ''.join(self.current_data) # If whitespace is not preserved, and this string contains # nothing but ASCII spaces, replace it with a single space # or newline. 
@@ -759,9 +761,9 @@ class BeautifulSoup(Tag): eventual_encoding = None if eventual_encoding != None: encoding_part = ' encoding="%s"' % eventual_encoding - prefix = u'<?xml version="1.0"%s?>\n' % encoding_part + prefix = '<?xml version="1.0"%s?>\n' % encoding_part else: - prefix = u'' + prefix = '' if not pretty_print: indent_level = None else: @@ -799,4 +801,4 @@ class FeatureNotFound(ValueError): if __name__ == '__main__': import sys soup = BeautifulSoup(sys.stdin) - print(soup.prettify()) + print((soup.prettify())) |