diff options
Diffstat (limited to 'bs4/element.py')
-rw-r--r-- | bs4/element.py | 110 |
1 files changed, 55 insertions, 55 deletions
diff --git a/bs4/element.py b/bs4/element.py index e7867a9..82a986e 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -3,14 +3,14 @@ __license__ = "MIT" try: from collections.abc import Callable # Python 3.6 -except ImportError , e: +except ImportError as e: from collections import Callable import re import sys import warnings try: import soupsieve -except ImportError, e: +except ImportError as e: soupsieve = None warnings.warn( 'The soupsieve package is not installed. CSS selectors cannot be used.' @@ -57,22 +57,22 @@ def _alias(attr): # Source: # https://docs.python.org/3/library/codecs.html#python-specific-encodings PYTHON_SPECIFIC_ENCODINGS = set([ - u"idna", - u"mbcs", - u"oem", - u"palmos", - u"punycode", - u"raw_unicode_escape", - u"undefined", - u"unicode_escape", - u"raw-unicode-escape", - u"unicode-escape", - u"string-escape", - u"string_escape", + "idna", + "mbcs", + "oem", + "palmos", + "punycode", + "raw_unicode_escape", + "undefined", + "unicode_escape", + "raw-unicode-escape", + "unicode-escape", + "string-escape", + "string_escape", ]) -class NamespacedAttribute(unicode): +class NamespacedAttribute(str): """A namespaced string (e.g. 'xml:lang') that remembers the namespace ('xml') and the name ('lang') that were used to create it. """ @@ -84,18 +84,18 @@ class NamespacedAttribute(unicode): name = None if not name: - obj = unicode.__new__(cls, prefix) + obj = str.__new__(cls, prefix) elif not prefix: # Not really namespaced. - obj = unicode.__new__(cls, name) + obj = str.__new__(cls, name) else: - obj = unicode.__new__(cls, prefix + ":" + name) + obj = str.__new__(cls, prefix + ":" + name) obj.prefix = prefix obj.name = name obj.namespace = namespace return obj -class AttributeValueWithCharsetSubstitution(unicode): +class AttributeValueWithCharsetSubstitution(str): """A stand-in object for a character encoding specified in HTML.""" class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): @@ -106,7 +106,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): """ def __new__(cls, original_value): - obj = unicode.__new__(cls, original_value) + obj = str.__new__(cls, original_value) obj.original_value = original_value return obj @@ -134,9 +134,9 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): match = cls.CHARSET_RE.search(original_value) if match is None: # No substitution necessary. - return unicode.__new__(unicode, original_value) + return str.__new__(str, original_value) - obj = unicode.__new__(cls, original_value) + obj = str.__new__(cls, original_value) obj.original_value = original_value return obj @@ -272,7 +272,7 @@ class PageElement(object): for string in self._all_strings(True): yield string - def get_text(self, separator=u"", strip=False, + def get_text(self, separator="", strip=False, types=default): """Get all child strings of this PageElement, concatenated using the given separator. @@ -418,7 +418,7 @@ class PageElement(object): raise ValueError("Cannot insert None into a tag.") if new_child is self: raise ValueError("Cannot insert a tag into itself.") - if (isinstance(new_child, basestring) + if (isinstance(new_child, str) and not isinstance(new_child, NavigableString)): new_child = NavigableString(new_child) @@ -795,7 +795,7 @@ class PageElement(object): result = (element for element in generator if isinstance(element, Tag)) return ResultSet(strainer, result) - elif isinstance(name, basestring): + elif isinstance(name, str): # Optimization to find all tags with a given name. if name.count(':') == 1: # This is a name with a prefix. If this is a namespace-aware document, @@ -914,7 +914,7 @@ class PageElement(object): return self.parents -class NavigableString(unicode, PageElement): +class NavigableString(str, PageElement): """A Python Unicode string that is part of a parse tree. When Beautiful Soup parses the markup <b>penguin</b>, it will @@ -937,10 +937,10 @@ class NavigableString(unicode, PageElement): passed in to the superclass's __new__ or the superclass won't know how to handle non-ASCII characters. """ - if isinstance(value, unicode): - u = unicode.__new__(cls, value) + if isinstance(value, str): + u = str.__new__(cls, value) else: - u = unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) u.setup() return u @@ -951,7 +951,7 @@ class NavigableString(unicode, PageElement): return type(self)(self) def __getnewargs__(self): - return (unicode(self),) + return (str(self),) def __getattr__(self, attr): """text.string gives you text. This is for backwards @@ -1059,30 +1059,30 @@ class PreformattedString(NavigableString): class CData(PreformattedString): """A CDATA block.""" - PREFIX = u'<![CDATA[' - SUFFIX = u']]>' + PREFIX = '<![CDATA[' + SUFFIX = ']]>' class ProcessingInstruction(PreformattedString): """A SGML processing instruction.""" - PREFIX = u'<?' - SUFFIX = u'>' + PREFIX = '<?' + SUFFIX = '>' class XMLProcessingInstruction(ProcessingInstruction): """An XML processing instruction.""" - PREFIX = u'<?' - SUFFIX = u'?>' + PREFIX = '<?' + SUFFIX = '?>' class Comment(PreformattedString): """An HTML or XML comment.""" - PREFIX = u'<!--' - SUFFIX = u'-->' + PREFIX = '<!--' + SUFFIX = '-->' class Declaration(PreformattedString): """An XML declaration.""" - PREFIX = u'<?' - SUFFIX = u'?>' + PREFIX = '<?' + SUFFIX = '?>' class Doctype(PreformattedString): @@ -1110,8 +1110,8 @@ class Doctype(PreformattedString): return Doctype(value) - PREFIX = u'<!DOCTYPE ' - SUFFIX = u'>\n' + PREFIX = '<!DOCTYPE ' + SUFFIX = '>\n' class Stylesheet(NavigableString): @@ -1496,7 +1496,7 @@ class Tag(PageElement): def __contains__(self, x): return x in self.contents - def __nonzero__(self): + def __bool__(self): "A tag is non-None even if it has no contents." return True @@ -1645,8 +1645,8 @@ class Tag(PageElement): else: if isinstance(val, list) or isinstance(val, tuple): val = ' '.join(val) - elif not isinstance(val, basestring): - val = unicode(val) + elif not isinstance(val, str): + val = str(val) elif ( isinstance(val, AttributeValueWithCharsetSubstitution) and eventual_encoding is not None @@ -1655,7 +1655,7 @@ class Tag(PageElement): text = formatter.attribute_value(val) decoded = ( - unicode(key) + '=' + str(key) + '=' + formatter.quoted_attribute_value(text)) attrs.append(decoded) close = '' @@ -2014,7 +2014,7 @@ class SoupStrainer(object): else: attrs = kwargs normalized_attrs = {} - for key, value in attrs.items(): + for key, value in list(attrs.items()): normalized_attrs[key] = self._normalize_search_value(value) self.attrs = normalized_attrs @@ -2023,7 +2023,7 @@ class SoupStrainer(object): def _normalize_search_value(self, value): # Leave it alone if it's a Unicode string, a callable, a # regular expression, a boolean, or None. - if (isinstance(value, unicode) or isinstance(value, Callable) or hasattr(value, 'match') + if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match') or isinstance(value, bool) or value is None): return value @@ -2036,7 +2036,7 @@ class SoupStrainer(object): new_value = [] for v in value: if (hasattr(v, '__iter__') and not isinstance(v, bytes) - and not isinstance(v, unicode)): + and not isinstance(v, str)): # This is almost certainly the user's mistake. In the # interests of avoiding infinite loops, we'll let # it through as-is rather than doing a recursive call. @@ -2048,7 +2048,7 @@ class SoupStrainer(object): # Otherwise, convert it into a Unicode string. # The unicode(str()) thing is so this will do the same thing on Python 2 # and Python 3. - return unicode(str(value)) + return str(str(value)) def __str__(self): """A human-readable representation of this SoupStrainer.""" @@ -2076,7 +2076,7 @@ class SoupStrainer(object): markup = markup_name markup_attrs = markup - if isinstance(self.name, basestring): + if isinstance(self.name, str): # Optimization for a very common case where the user is # searching for a tag with one specific name, and we're # looking at a tag with a different name. @@ -2132,7 +2132,7 @@ class SoupStrainer(object): found = None # If given a list of items, scan it for a text element that # matches. - if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): + if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)): for element in markup: if isinstance(element, NavigableString) \ and self.search(element): @@ -2145,7 +2145,7 @@ class SoupStrainer(object): found = self.search_tag(markup) # If it's text, make sure the text matches. elif isinstance(markup, NavigableString) or \ - isinstance(markup, basestring): + isinstance(markup, str): if not self.name and not self.attrs and self._matches(markup, self.text): found = markup else: @@ -2190,7 +2190,7 @@ class SoupStrainer(object): return not match_against if (hasattr(match_against, '__iter__') - and not isinstance(match_against, basestring)): + and not isinstance(match_against, str)): # We're asked to match against an iterable of items. # The markup must be match at least one item in the # iterable. We'll try each one in turn. @@ -2217,7 +2217,7 @@ class SoupStrainer(object): # the tag's name and once against its prefixed name. match = False - if not match and isinstance(match_against, unicode): + if not match and isinstance(match_against, str): # Exact string match match = markup == match_against |