summaryrefslogtreecommitdiff
path: root/bs4/builder
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/builder')
-rw-r--r--bs4/builder/__init__.py6
-rw-r--r--bs4/builder/_html5lib.py20
-rw-r--r--bs4/builder/_htmlparser.py18
-rw-r--r--bs4/builder/_lxml.py26
4 files changed, 35 insertions, 35 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index b6e2c37..bd44905 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -301,13 +301,13 @@ class TreeBuilder(object):
universal = self.cdata_list_attributes.get('*', [])
tag_specific = self.cdata_list_attributes.get(
tag_name.lower(), None)
- for attr in attrs.keys():
+ for attr in list(attrs.keys()):
if attr in universal or (tag_specific and attr in tag_specific):
# We have a "class"-type attribute whose string
# value is a whitespace-separated list of
# values. Split it into a list.
value = attrs[attr]
- if isinstance(value, basestring):
+ if isinstance(value, str):
values = nonwhitespace_re.findall(value)
else:
# html5lib sometimes calls setAttributes twice
@@ -497,7 +497,7 @@ class ParserRejectedMarkup(Exception):
"""
if isinstance(message_or_exception, Exception):
e = message_or_exception
- message_or_exception = "%s: %s" % (e.__class__.__name__, unicode(e))
+ message_or_exception = "%s: %s" % (e.__class__.__name__, str(e))
super(ParserRejectedMarkup, self).__init__(message_or_exception)
# Builders are registered in reverse order of priority, so that custom
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index a1c6134..69aefd7 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -33,7 +33,7 @@ try:
# Pre-0.99999999
from html5lib.treebuilders import _base as treebuilder_base
new_html5lib = False
-except ImportError, e:
+except ImportError as e:
# 0.99999999 and up
from html5lib.treebuilders import base as treebuilder_base
new_html5lib = True
@@ -79,7 +79,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
self.underlying_builder.parser = parser
extra_kwargs = dict()
- if not isinstance(markup, unicode):
+ if not isinstance(markup, str):
if new_html5lib:
extra_kwargs['override_encoding'] = self.user_specified_encoding
else:
@@ -87,13 +87,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
doc = parser.parse(markup, **extra_kwargs)
# Set the character encoding detected by the tokenizer.
- if isinstance(markup, unicode):
+ if isinstance(markup, str):
# We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
else:
original_encoding = parser.tokenizer.stream.charEncoding[0]
- if not isinstance(original_encoding, basestring):
+ if not isinstance(original_encoding, str):
# In 0.99999999 and up, the encoding is an html5lib
# Encoding object. We want to use a string for compatibility
# with other tree builders.
@@ -110,7 +110,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
- return u'<html><head></head><body>%s</body></html>' % fragment
+ return '<html><head></head><body>%s</body></html>' % fragment
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
@@ -217,7 +217,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
rv.append("|%s<%s>" % (' ' * indent, name))
if element.attrs:
attributes = []
- for name, value in element.attrs.items():
+ for name, value in list(element.attrs.items()):
if isinstance(name, NamespacedAttribute):
name = "%s %s" % (prefixes[name.namespace], name.name)
if isinstance(value, list):
@@ -272,7 +272,7 @@ class Element(treebuilder_base.Node):
def appendChild(self, node):
string_child = child = None
- if isinstance(node, basestring):
+ if isinstance(node, str):
# Some other piece of code decided to pass in a string
# instead of creating a TextElement object to contain the
# string.
@@ -289,7 +289,7 @@ class Element(treebuilder_base.Node):
child = node.element
node.parent = self
- if not isinstance(child, basestring) and child.parent is not None:
+ if not isinstance(child, str) and child.parent is not None:
node.element.extract()
if (string_child is not None and self.element.contents
@@ -302,7 +302,7 @@ class Element(treebuilder_base.Node):
old_element.replace_with(new_element)
self.soup._most_recent_element = new_element
else:
- if isinstance(node, basestring):
+ if isinstance(node, str):
# Create a brand new NavigableString from this string.
child = self.soup.new_string(node)
@@ -340,7 +340,7 @@ class Element(treebuilder_base.Node):
self.soup.builder._replace_cdata_list_attribute_values(
self.name, attributes)
- for name, value in attributes.items():
+ for name, value in list(attributes.items()):
self.element[name] = value
# The attributes may contain variables that need substitution.
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 355be11..70e9be8 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -8,11 +8,11 @@ __all__ = [
'HTMLParserTreeBuilder',
]
-from HTMLParser import HTMLParser
+from html.parser import HTMLParser
try:
- from HTMLParser import HTMLParseError
-except ImportError, e:
+ from html.parser import HTMLParseError
+except ImportError as e:
# HTMLParseError is removed in Python 3.5. Since it can never be
# thrown in 3.5, we can just define our own class as a placeholder.
class HTMLParseError(Exception):
@@ -219,14 +219,14 @@ class BeautifulSoupHTMLParser(HTMLParser):
continue
try:
data = bytearray([real_name]).decode(encoding)
- except UnicodeDecodeError, e:
+ except UnicodeDecodeError as e:
pass
if not data:
try:
- data = unichr(real_name)
- except (ValueError, OverflowError), e:
+ data = chr(real_name)
+ except (ValueError, OverflowError) as e:
pass
- data = data or u"\N{REPLACEMENT CHARACTER}"
+ data = data or "\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
def handle_entityref(self, name):
@@ -353,7 +353,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
document to Unicode and parsing it. Each strategy will be tried
in turn.
"""
- if isinstance(markup, unicode):
+ if isinstance(markup, str):
# Parse Unicode as-is.
yield (markup, None, None, False)
return
@@ -391,7 +391,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
try:
parser.feed(markup)
parser.close()
- except HTMLParseError, e:
+ except HTMLParseError as e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index c670b84..11c9a69 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -8,11 +8,11 @@ __all__ = [
try:
from collections.abc import Callable # Python 3.6
-except ImportError , e:
+except ImportError as e:
from collections import Callable
from io import BytesIO
-from StringIO import StringIO
+from io import StringIO
from lxml import etree
from bs4.element import (
Comment,
@@ -35,7 +35,7 @@ LXML = 'lxml'
def _invert(d):
"Invert a dictionary."
- return dict((v,k) for k, v in d.items())
+ return dict((v,k) for k, v in list(d.items()))
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
@@ -81,7 +81,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
:param mapping: A dictionary mapping namespace prefixes to URIs.
"""
- for key, value in mapping.items():
+ for key, value in list(mapping.items()):
if key and key not in self.soup._namespaces:
# Let the BeautifulSoup object know about a new namespace.
# If there are multiple namespaces defined with the same
@@ -169,12 +169,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
else:
self.processing_instruction_class = XMLProcessingInstruction
- if isinstance(markup, unicode):
+ if isinstance(markup, str):
# We were given Unicode. Maybe lxml can parse Unicode on
# this system?
yield markup, None, document_declared_encoding, False
- if isinstance(markup, unicode):
+ if isinstance(markup, str):
# No, apparently not. Convert the Unicode to UTF-8 and
# tell lxml to parse it as UTF-8.
yield (markup.encode("utf8"), "utf8",
@@ -199,7 +199,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def feed(self, markup):
if isinstance(markup, bytes):
markup = BytesIO(markup)
- elif isinstance(markup, unicode):
+ elif isinstance(markup, str):
markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty,
@@ -214,7 +214,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if len(data) != 0:
self.parser.feed(data)
self.parser.close()
- except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+ except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(e)
def close(self):
@@ -243,7 +243,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later.
attrs = attrs.copy()
- for prefix, namespace in nsmap.items():
+ for prefix, namespace in list(nsmap.items()):
attribute = NamespacedAttribute(
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace
@@ -252,7 +252,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# from lxml with namespaces attached to their names, and
# turn then into NamespacedAttribute objects.
new_attrs = {}
- for attr, value in attrs.items():
+ for attr, value in list(attrs.items()):
namespace, attr = self._getNsTag(attr)
if namespace is None:
new_attrs[attr] = value
@@ -312,7 +312,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
- return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
+ return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
@@ -333,10 +333,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
self.parser = self.parser_for(encoding)
self.parser.feed(markup)
self.parser.close()
- except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+ except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(e)
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
- return u'<html><body>%s</body></html>' % fragment
+ return '<html><body>%s</body></html>' % fragment