7 files changed, 86 insertions, 106 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 66a1c02..c036521 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -16,7 +16,6 @@ For more than you ever wanted to know about Beautiful Soup, see the
 documentation:
 http://www.crummy.com/software/BeautifulSoup/documentation.html
 """
-from __future__ import generators
 
 __author__ = "Leonard Richardson (leonardr@segfault.org)"
 __version__ = "4.0.0a"
@@ -27,10 +26,9 @@ __all__ = ['BeautifulSoup']
 
 import re
 
-from util import isList, buildSet
-from builder import builder_registry
-from dammit import UnicodeDammit
-from element import DEFAULT_OUTPUT_ENCODING, NavigableString, Tag
+from .builder import builder_registry
+from .dammit import UnicodeDammit
+from .element import DEFAULT_OUTPUT_ENCODING, NavigableString, Tag
 
 
 class BeautifulSoup(Tag):
@@ -145,7 +143,7 @@ class BeautifulSoup(Tag):
         if self.currentData:
             currentData = u''.join(self.currentData)
             if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
-                not buildSet([tag.name for tag in self.tagStack]).intersection(
+                not set([tag.name for tag in self.tagStack]).intersection(
                     self.builder.preserve_whitespace_tags)):
                 if '\n' in currentData:
                     currentData = '\n'
@@ -161,10 +159,10 @@ class BeautifulSoup(Tag):
 
     def object_was_parsed(self, o):
         """Add an object to the parse tree."""
-        o.setup(self.currentTag, self.previous)
-        if self.previous:
-            self.previous.next = o
-        self.previous = o
+        o.setup(self.currentTag, self.previous_element)
+        if self.previous_element:
+            self.previous_element.next_element = o
+        self.previous_element = o
         self.currentTag.contents.append(o)
 
     def _popToTag(self, name, inclusivePop=True):
@@ -208,12 +206,12 @@ class BeautifulSoup(Tag):
             return None
 
         tag = Tag(self, self.builder, name, attrs, self.currentTag,
-                  self.previous)
+                  self.previous_element)
         if tag is None:
             return tag
-        if self.previous:
-            self.previous.next = tag
-        self.previous = tag
+        if self.previous_element:
+            self.previous_element.next_element = tag
+        self.previous_element = tag
         self.pushTag(tag)
         return tag
 
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index afd49b9..e6d4fa1 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -144,7 +144,7 @@ class SAXTreeBuilder(TreeBuilder):
         pass
 
     def startElement(self, name, attrs):
-        attrs = dict((key[1], value) for key, value in attrs.items())
+        attrs = dict((key[1], value) for key, value in list(attrs.items()))
         #print "Start %s, %r" % (name, attrs)
         self.soup.handle_starttag(name, attrs)
 
@@ -247,16 +247,16 @@ def register_treebuilders_from(module):
 # builder registrations will take precedence. In general, we want
 # html5lib to take precedence over lxml, because it's more
 # reliable. And we only want to use HTMLParser as a last result.
-import _htmlparser
+from . import _htmlparser
 register_treebuilders_from(_htmlparser)
 try:
-    import _lxml
+    from . import _lxml
     register_treebuilders_from(_lxml)
 except ImportError:
     # They don't have lxml installed.
     pass
 try:
-    import _html5lib
+    from . import _html5lib
     register_treebuilders_from(_html5lib)
 except ImportError:
     # They don't have html5lib installed.
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index d74c4b0..e9d7f58 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -102,18 +102,18 @@ class AttrList(object):
         self.element = element
         self.attrs = dict(self.element.attrs)
     def __iter__(self):
-        return self.attrs.items().__iter__()
+        return iter(self.attrs.items())
     def __setitem__(self, name, value):
         "set attr", name, value
         self.element[name] = value
     def items(self):
-        return self.attrs.items()
+        return list(self.attrs.items())
     def keys(self):
-        return self.attrs.keys()
+        return list(self.attrs.keys())
     def __getitem__(self, name):
         return self.attrs[name]
     def __contains__(self, name):
-        return name in self.attrs.keys()
+        return name in self.attrs
 
 
 class Element(html5lib.treebuilders._base.Node):
@@ -155,7 +155,7 @@ class Element(html5lib.treebuilders._base.Node):
 
     def setAttributes(self, attributes):
         if attributes is not None and attributes != {}:
-            for name, value in attributes.items():
+            for name, value in list(attributes.items()):
                 self.element[name] =  value
             # The attributes may contain variables that need substitution.
             # Call set_up_substitutions manually.
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 57798f6..07b2032 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -3,6 +3,7 @@ __all__ = [
     'LXMLTreeBuilder',
     ]
 
+import collections
 from lxml import etree
 from bs4.element import Comment, Doctype
 from bs4.builder import (
@@ -36,7 +37,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         if parser is None:
             # Use the default parser.
             parser = self.default_parser
-        if callable(parser):
+        if isinstance(parser, collections.Callable):
             # Instantiate the parser with default arguments
             parser = parser(target=self, strip_cdata=False)
         self.parser = parser
diff --git a/bs4/dammit.py b/bs4/dammit.py
index f3e770e..ed5dc29 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -9,7 +9,6 @@ encoding; that's the tree builder's job.
 import codecs
 from htmlentitydefs import codepoint2name
 import re
-import types
 
 # Autodetects character encodings. Very useful.
 # Download from http://chardet.feedparser.org/
@@ -37,7 +36,7 @@ class EntitySubstitution(object):
         lookup = {}
         reverse_lookup = {}
         characters = []
-        for codepoint, name in codepoint2name.items():
+        for codepoint, name in list(codepoint2name.items()):
             if codepoint == 34:
                 # There's no point in turning the quotation mark into
                 # &quot;, unless it happens within an attribute value, which
@@ -174,7 +173,7 @@ class UnicodeDammit:
         self.tried_encodings = []
         if markup == '' or isinstance(markup, unicode):
             self.original_encoding = None
-            self.unicode = unicode(markup)
+            self.unicode_markup = unicode(markup)
             return
 
         u = None
@@ -196,7 +195,7 @@ class UnicodeDammit:
                 if u:
                     break
 
-        self.unicode = u
+        self.unicode_markup = u
         if not u:
             self.original_encoding = None
 
@@ -205,7 +204,7 @@ class UnicodeDammit:
         entity."""
         orig = match.group(1)
         sub = self.MS_CHARS.get(orig)
-        if type(sub) == types.TupleType:
+        if isinstance(sub, tuple):
             if self.smart_quotes_to == 'xml':
                 sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
             else:
@@ -234,7 +233,7 @@ class UnicodeDammit:
             u = self._to_unicode(markup, proposed)
             self.markup = u
             self.original_encoding = proposed
-        except Exception, e:
+        except Exception as e:
             # print "That didn't work!"
             # print e
             return None
@@ -376,7 +375,7 @@ class UnicodeDammit:
                     250,251,252,253,254,255)
             import string
             c.EBCDIC_TO_ASCII_MAP = string.maketrans(
-            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
+            ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap)))
         return s.translate(c.EBCDIC_TO_ASCII_MAP)
 
     MS_CHARS = {'\x80': ('euro', '20AC'),
diff --git a/bs4/element.py b/bs4/element.py
index e141aa8..95661ae 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1,13 +1,7 @@
+import collections
 import re
-import types
-try:
-    from htmlentitydefs import name2codepoint
-except ImportError:
-    name2codepoint = {}
 from bs4.dammit import EntitySubstitution
 
-from util import isList
-
 DEFAULT_OUTPUT_ENCODING = "utf-8"
 
 
@@ -32,12 +26,12 @@ class PageElement(object):
     """Contains the navigational information for some part of the page
     (either a tag or a piece of text)"""
 
-    def setup(self, parent=None, previous=None):
+    def setup(self, parent=None, previous_element=None):
         """Sets up the initial relations between this element and
         other elements."""
         self.parent = parent
-        self.previous = previous
-        self.next = None
+        self.previous_element = previous_element
+        self.next_element = None
         self.previous_sibling = None
         self.next_sibling = None
         if self.parent and self.parent.contents:
@@ -81,14 +75,14 @@ class PageElement(object):
         #this element (and any children) hadn't been parsed. Connect
         #the two.
         last_child = self._last_recursive_child()
-        next_element = last_child.next
+        next_element = last_child.next_element
 
-        if self.previous:
-            self.previous.next = next_element
+        if self.previous_element:
+            self.previous_element.next_element = next_element
         if next_element:
-            next_element.previous = self.previous
-        self.previous = None
-        last_child.next = None
+            next_element.previous_element = self.previous_element
+        self.previous_element = None
+        last_child.next_element = None
 
         self.parent = None
         if self.previous_sibling:
@@ -129,14 +123,14 @@ class PageElement(object):
         previous_child = None
         if position == 0:
             new_child.previous_sibling = None
-            new_child.previous = self
+            new_child.previous_element = self
         else:
             previous_child = self.contents[position - 1]
             new_child.previous_sibling = previous_child
             new_child.previous_sibling.next_sibling = new_child
-            new_child.previous = previous_child._last_recursive_child()
+            new_child.previous_element = previous_child._last_recursive_child()
         if new_child.previous:
-            new_child.previous.next = new_child
+            new_child.previous_element.next_element = new_child
 
         new_childs_last_element = new_child._last_recursive_child()
 
@@ -151,18 +145,18 @@ class PageElement(object):
                 if not parent:  # This is the last element in the document.
                     break
             if parents_next_sibling:
-                new_childs_last_element.next = parents_next_sibling
+                new_childs_last_element.next_element = parents_next_sibling
             else:
-                new_childs_last_element.next = None
+                new_childs_last_element.next_element = None
         else:
             next_child = self.contents[position]
             new_child.next_sibling = next_child
             if new_child.next_sibling:
                 new_child.next_sibling.previous_sibling = new_child
-            new_childs_last_element.next = next_child
+            new_childs_last_element.next_element = next_child
 
-        if new_childs_last_element.next:
-            new_childs_last_element.next.previous = new_childs_last_element
+        if new_childs_last_element.next_element:
+            new_childs_last_element.next_element.previous_element = new_childs_last_element
         self.contents.insert(position, new_child)
 
     def append(self, tag):
@@ -252,6 +246,14 @@ class PageElement(object):
     findParents = find_parents   # BS3
     fetchParents = find_parents  # BS2
 
+    @property
+    def next(self):
+        return self.next_element
+
+    @property
+    def previous(self):
+        return self.previous_element
+
     #These methods do the real heavy lifting.
 
     def _find_one(self, method, name, attrs, text, **kwargs):
@@ -283,7 +285,7 @@ class PageElement(object):
         results = ResultSet(strainer)
         while True:
             try:
-                i = generator.next()
+                i = next(generator)
             except StopIteration:
                 break
             if i:
@@ -300,7 +302,7 @@ class PageElement(object):
     def next_elements(self):
         i = self
         while i is not None:
-            i = i.next
+            i = i.next_element
             yield i
 
     @property
@@ -314,7 +316,7 @@ class PageElement(object):
     def previous_elements(self):
         i = self
         while i is not None:
-            i = i.previous
+            i = i.previous_element
             yield i
 
     @property
@@ -381,8 +383,9 @@ class NavigableString(unicode, PageElement):
         if attr == 'string':
             return self
         else:
-            raise AttributeError("'%s' object has no attribute '%s'" %
-                                 (self.__class__.__name__, attr))
+            raise AttributeError(
+                "'%s' object has no attribute '%s'" % (
+                    self.__class__.__name__, attr))
 
     def output_ready(self, substitute_html_entities=False):
         if substitute_html_entities:
@@ -554,7 +557,7 @@ class Tag(PageElement):
         attribute."""
         return self.attrs.get(key, default)
 
-    def has_key(self, key):
+    def has_attr(self, key):
         return key in self.attrs
 
     def __getitem__(self, key):
@@ -590,7 +593,7 @@ class Tag(PageElement):
         """Calling a tag like a function is the same as calling its
         find_all() method. Eg. tag('a') returns a list of all the A tags
         found within this tag."""
-        return apply(self.find_all, args, kwargs)
+        return self.find_all(*args, **kwargs)
 
     def __getattr__(self, tag):
         #print "Getattr %s.%s" % (self.__class__, tag)
@@ -598,8 +601,8 @@ class Tag(PageElement):
             return self.find(tag[:-3])
         elif not tag.startswith("__"):
             return self.find(tag)
-        raise AttributeError("'%s' object has no attribute '%s'" %
-                               (self.__class__, tag))
+        raise AttributeError(
+            "'%s' object has no attribute '%s'" % (self.__class__, tag))
 
     def __eq__(self, other):
         """Returns true iff this tag has the same name, the same attributes,
@@ -782,11 +785,11 @@ class Tag(PageElement):
     def recursive_children(self):
         if not len(self.contents):
             return
-        stopNode = self._last_recursive_child().next
+        stopNode = self._last_recursive_child().next_element
         current = self.contents[0]
         while current is not stopNode:
             yield current
-            current = current.next
+            current = current.next_element
 
     # Old names for backwards compatibility
     def childGenerator(self):
@@ -795,6 +798,10 @@ class Tag(PageElement):
     def recursiveChildGenerator(self):
         return self.recursive_children
 
+    # This was kind of misleading because has_key() (attributes) was
+    # different from __contains__ (contents). has_key() is gone in
+    # Python 3, anyway.
+    has_key = has_attr
 
 # Next, a couple classes to represent queries and their results.
 class SoupStrainer(object):
@@ -827,19 +834,20 @@ class SoupStrainer(object):
         if isinstance(markup_name, Tag):
             markup = markup_name
             markup_attrs = markup
-        call_function_with_tag_data = callable(self.name) \
-                                and not isinstance(markup_name, Tag)
-
-        if (not self.name) \
-               or call_function_with_tag_data \
-               or (markup and self._matches(markup, self.name)) \
-               or (not markup and self._matches(markup_name, self.name)):
+        call_function_with_tag_data = (
+            isinstance(self.name, collections.Callable)
+            and not isinstance(markup_name, Tag))
+
+        if ((not self.name)
+            or call_function_with_tag_data
+            or (markup and self._matches(markup, self.name))
+            or (not markup and self._matches(markup_name, self.name))):
             if call_function_with_tag_data:
                 match = self.name(markup_name, markup_attrs)
             else:
                 match = True
                 markup_attr_map = None
-                for attr, match_against in self.attrs.items():
+                for attr, match_against in list(self.attrs.items()):
                     if not markup_attr_map:
                         if hasattr(markup_attrs, 'get'):
                             markup_attr_map = markup_attrs
@@ -864,7 +872,7 @@ class SoupStrainer(object):
         found = None
         # If given a list of items, scan it for a text element that
         # matches.
-        if isList(markup) and not isinstance(markup, Tag):
+        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)):
             for element in markup:
                 if isinstance(element, NavigableString) \
                        and self.search(element):
@@ -881,8 +889,8 @@ class SoupStrainer(object):
             if self._matches(markup, self.text):
                 found = markup
         else:
-            raise Exception("I don't know how to match against a %s"
-                  % markup.__class__)
+            raise Exception(
+                "I don't know how to match against a %s" % markup.__class__)
         return found
 
     def _matches(self, markup, match_against):
@@ -890,7 +898,7 @@ class SoupStrainer(object):
         result = False
         if match_against is True:
             result = markup is not None
-        elif callable(match_against):
+        elif isinstance(match_against, collections.Callable):
             result = match_against(markup)
         else:
             #Custom match methods take the tag as an argument, but all
@@ -903,17 +911,14 @@ class SoupStrainer(object):
             if hasattr(match_against, 'match'):
                 # It's a regexp object.
                 result = markup and match_against.search(markup)
-            elif (isList(match_against)
+            elif (hasattr(match_against, '__iter__') and not isinstance(match_against, basestring)
                   and (markup is not None
                        or not isinstance(match_against, basestring))):
                 result = markup in match_against
             elif hasattr(match_against, 'items'):
                 result = match_against in markup
             elif match_against and isinstance(markup, basestring):
-                if isinstance(markup, unicode):
-                    match_against = unicode(match_against)
-                else:
-                    match_against = str(match_against)
+                match_against = markup.__class__(match_against)
 
             if not result:
                 result = match_against == markup
diff --git a/bs4/util.py b/bs4/util.py
deleted file mode 100644
index 8e33273..0000000
--- a/bs4/util.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Helper functions and mixin classes for Beautiful Soup
-
-import types
-try:
-    set
-except NameError:
-    from sets import Set as set
-
-
-def isList(l):
-    """Convenience method that works with all 2.x versions of Python
-    to determine whether or not something is listlike."""
-    return ((hasattr(l, '__iter__') and not isinstance(l, basestring))
-            or (type(l) in (types.ListType, types.TupleType)))
-
-
-def buildSet(args=None):
-    """Turns a list or a string into a set."""
-    if isinstance(args, str):
-        return set([args])
-    if args is None:
-        return set()
-    return set(args)