diff options
Diffstat (limited to 'bs4/builder')
-rw-r--r-- | bs4/builder/__init__.py | 6 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 20 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 18 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 26 |
4 files changed, 35 insertions, 35 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index b6e2c37..bd44905 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -301,13 +301,13 @@ class TreeBuilder(object): universal = self.cdata_list_attributes.get('*', []) tag_specific = self.cdata_list_attributes.get( tag_name.lower(), None) - for attr in attrs.keys(): + for attr in list(attrs.keys()): if attr in universal or (tag_specific and attr in tag_specific): # We have a "class"-type attribute whose string # value is a whitespace-separated list of # values. Split it into a list. value = attrs[attr] - if isinstance(value, basestring): + if isinstance(value, str): values = nonwhitespace_re.findall(value) else: # html5lib sometimes calls setAttributes twice @@ -497,7 +497,7 @@ class ParserRejectedMarkup(Exception): """ if isinstance(message_or_exception, Exception): e = message_or_exception - message_or_exception = "%s: %s" % (e.__class__.__name__, unicode(e)) + message_or_exception = "%s: %s" % (e.__class__.__name__, str(e)) super(ParserRejectedMarkup, self).__init__(message_or_exception) # Builders are registered in reverse order of priority, so that custom diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index a1c6134..69aefd7 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -33,7 +33,7 @@ try: # Pre-0.99999999 from html5lib.treebuilders import _base as treebuilder_base new_html5lib = False -except ImportError, e: +except ImportError as e: # 0.99999999 and up from html5lib.treebuilders import base as treebuilder_base new_html5lib = True @@ -79,7 +79,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): parser = html5lib.HTMLParser(tree=self.create_treebuilder) self.underlying_builder.parser = parser extra_kwargs = dict() - if not isinstance(markup, unicode): + if not isinstance(markup, str): if new_html5lib: extra_kwargs['override_encoding'] = self.user_specified_encoding else: @@ -87,13 +87,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder): doc = parser.parse(markup, **extra_kwargs) # Set the character encoding detected by the tokenizer. - if isinstance(markup, unicode): + if isinstance(markup, str): # We need to special-case this because html5lib sets # charEncoding to UTF-8 if it gets Unicode input. doc.original_encoding = None else: original_encoding = parser.tokenizer.stream.charEncoding[0] - if not isinstance(original_encoding, basestring): + if not isinstance(original_encoding, str): # In 0.99999999 and up, the encoding is an html5lib # Encoding object. We want to use a string for compatibility # with other tree builders. @@ -110,7 +110,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" - return u'<html><head></head><body>%s</body></html>' % fragment + return '<html><head></head><body>%s</body></html>' % fragment class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): @@ -217,7 +217,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): rv.append("|%s<%s>" % (' ' * indent, name)) if element.attrs: attributes = [] - for name, value in element.attrs.items(): + for name, value in list(element.attrs.items()): if isinstance(name, NamespacedAttribute): name = "%s %s" % (prefixes[name.namespace], name.name) if isinstance(value, list): @@ -272,7 +272,7 @@ class Element(treebuilder_base.Node): def appendChild(self, node): string_child = child = None - if isinstance(node, basestring): + if isinstance(node, str): # Some other piece of code decided to pass in a string # instead of creating a TextElement object to contain the # string. @@ -289,7 +289,7 @@ class Element(treebuilder_base.Node): child = node.element node.parent = self - if not isinstance(child, basestring) and child.parent is not None: + if not isinstance(child, str) and child.parent is not None: node.element.extract() if (string_child is not None and self.element.contents @@ -302,7 +302,7 @@ class Element(treebuilder_base.Node): old_element.replace_with(new_element) self.soup._most_recent_element = new_element else: - if isinstance(node, basestring): + if isinstance(node, str): # Create a brand new NavigableString from this string. child = self.soup.new_string(node) @@ -340,7 +340,7 @@ class Element(treebuilder_base.Node): self.soup.builder._replace_cdata_list_attribute_values( self.name, attributes) - for name, value in attributes.items(): + for name, value in list(attributes.items()): self.element[name] = value # The attributes may contain variables that need substitution. diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 355be11..70e9be8 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -8,11 +8,11 @@ __all__ = [ 'HTMLParserTreeBuilder', ] -from HTMLParser import HTMLParser +from html.parser import HTMLParser try: - from HTMLParser import HTMLParseError -except ImportError, e: + from html.parser import HTMLParseError +except ImportError as e: # HTMLParseError is removed in Python 3.5. Since it can never be # thrown in 3.5, we can just define our own class as a placeholder. class HTMLParseError(Exception): @@ -219,14 +219,14 @@ class BeautifulSoupHTMLParser(HTMLParser): continue try: data = bytearray([real_name]).decode(encoding) - except UnicodeDecodeError, e: + except UnicodeDecodeError as e: pass if not data: try: - data = unichr(real_name) - except (ValueError, OverflowError), e: + data = chr(real_name) + except (ValueError, OverflowError) as e: pass - data = data or u"\N{REPLACEMENT CHARACTER}" + data = data or "\N{REPLACEMENT CHARACTER}" self.handle_data(data) def handle_entityref(self, name): @@ -353,7 +353,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): document to Unicode and parsing it. Each strategy will be tried in turn. """ - if isinstance(markup, unicode): + if isinstance(markup, str): # Parse Unicode as-is. yield (markup, None, None, False) return @@ -391,7 +391,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): try: parser.feed(markup) parser.close() - except HTMLParseError, e: + except HTMLParseError as e: warnings.warn(RuntimeWarning( "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) raise e diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index c670b84..11c9a69 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -8,11 +8,11 @@ __all__ = [ try: from collections.abc import Callable # Python 3.6 -except ImportError , e: +except ImportError as e: from collections import Callable from io import BytesIO -from StringIO import StringIO +from io import StringIO from lxml import etree from bs4.element import ( Comment, @@ -35,7 +35,7 @@ LXML = 'lxml' def _invert(d): "Invert a dictionary." - return dict((v,k) for k, v in d.items()) + return dict((v,k) for k, v in list(d.items())) class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_PARSER_CLASS = etree.XMLParser @@ -81,7 +81,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): :param mapping: A dictionary mapping namespace prefixes to URIs. """ - for key, value in mapping.items(): + for key, value in list(mapping.items()): if key and key not in self.soup._namespaces: # Let the BeautifulSoup object know about a new namespace. # If there are multiple namespaces defined with the same @@ -169,12 +169,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): else: self.processing_instruction_class = XMLProcessingInstruction - if isinstance(markup, unicode): + if isinstance(markup, str): # We were given Unicode. Maybe lxml can parse Unicode on # this system? yield markup, None, document_declared_encoding, False - if isinstance(markup, unicode): + if isinstance(markup, str): # No, apparently not. Convert the Unicode to UTF-8 and # tell lxml to parse it as UTF-8. yield (markup.encode("utf8"), "utf8", @@ -199,7 +199,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): def feed(self, markup): if isinstance(markup, bytes): markup = BytesIO(markup) - elif isinstance(markup, unicode): + elif isinstance(markup, str): markup = StringIO(markup) # Call feed() at least once, even if the markup is empty, @@ -214,7 +214,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): if len(data) != 0: self.parser.feed(data) self.parser.close() - except (UnicodeDecodeError, LookupError, etree.ParserError), e: + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: raise ParserRejectedMarkup(e) def close(self): @@ -243,7 +243,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): # Also treat the namespace mapping as a set of attributes on the # tag, so we can recreate it later. attrs = attrs.copy() - for prefix, namespace in nsmap.items(): + for prefix, namespace in list(nsmap.items()): attribute = NamespacedAttribute( "xmlns", prefix, "http://www.w3.org/2000/xmlns/") attrs[attribute] = namespace @@ -252,7 +252,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): # from lxml with namespaces attached to their names, and # turn then into NamespacedAttribute objects. new_attrs = {} - for attr, value in attrs.items(): + for attr, value in list(attrs.items()): namespace, attr = self._getNsTag(attr) if namespace is None: new_attrs[attr] = value @@ -312,7 +312,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" - return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment + return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): @@ -333,10 +333,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): self.parser = self.parser_for(encoding) self.parser.feed(markup) self.parser.close() - except (UnicodeDecodeError, LookupError, etree.ParserError), e: + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: raise ParserRejectedMarkup(e) def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" - return u'<html><body>%s</body></html>' % fragment + return '<html><body>%s</body></html>' % fragment |