-rw-r--r-- | bs4/builder/__init__.py        | 263
-rw-r--r-- | bs4/builder/_htmlparser.py     |  94
-rw-r--r-- | bs4/dammit.py                  | 412
-rw-r--r-- | tests/test_builder_registry.py |   8
-rw-r--r-- | tests/test_htmlparser.py       | 126
-rw-r--r-- | tests/test_lxml.py             |  23
6 files changed, 903 insertions, 23 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
new file mode 100644
index 0000000..17dcff3
--- /dev/null
+++ b/bs4/builder/__init__.py
@@ -0,0 +1,263 @@
+from collections import defaultdict
+import re
+import sys
+
+__all__ = [
+    'HTMLTreeBuilder',
+    'SAXTreeBuilder',
+    'TreeBuilder',
+    'TreeBuilderRegistry',
+    ]
+
+# Some useful features for a TreeBuilder to have.
+FAST = 'fast'
+PERMISSIVE = 'permissive'
+STRICT = 'strict'
+XML = 'xml'
+HTML = 'html'
+HTML_5 = 'html5'
+
+
+class TreeBuilderRegistry(object):
+
+    def __init__(self):
+        self.builders_for_feature = defaultdict(list)
+        self.builders = []
+
+    def register(self, treebuilder_class):
+        """Register a treebuilder based on its advertised features."""
+        for feature in treebuilder_class.features:
+            self.builders_for_feature[feature].insert(0, treebuilder_class)
+        self.builders.insert(0, treebuilder_class)
+
+    def lookup(self, *features):
+        if len(self.builders) == 0:
+            # There are no builders at all.
+            return None
+
+        if len(features) == 0:
+            # They didn't ask for any features. Give them the most
+            # recently registered builder.
+            return self.builders[0]
+
+        # Go down the list of features in order, and eliminate any builders
+        # that don't match every feature.
+        features = list(features)
+        features.reverse()
+        candidates = None
+        candidate_set = None
+        while len(features) > 0:
+            feature = features.pop()
+            we_have_the_feature = self.builders_for_feature.get(feature, [])
+            if len(we_have_the_feature) > 0:
+                if candidates is None:
+                    candidates = we_have_the_feature
+                    candidate_set = set(candidates)
+                else:
+                    # Eliminate any candidates that don't have this feature.
+                    candidate_set = candidate_set.intersection(
+                        set(we_have_the_feature))
+
+        # The only valid candidates are the ones in candidate_set.
+        # Go through the original list of candidates and pick the first one
+        # that's in candidate_set.
+        if candidate_set is None:
+            return None
+        for candidate in candidates:
+            if candidate in candidate_set:
+                return candidate
+        return None
+
+# The BeautifulSoup class will take feature lists from developers and use them
+# to look up builders in this registry.
+builder_registry = TreeBuilderRegistry()
+
+
+class TreeBuilder(object):
+    """Turn a document into a Beautiful Soup object tree."""
+
+    features = []
+
+    is_xml = False
+    preserve_whitespace_tags = set()
+    empty_element_tags = None # A tag will be considered an empty-element
+                              # tag when and only when it has no contents.
+
+    def __init__(self):
+        self.soup = None
+
+    def reset(self):
+        pass
+
+    def can_be_empty_element(self, tag_name):
+        """Might a tag with this name be an empty-element tag?
+
+        The final markup may or may not actually present this tag as
+        self-closing.
+
+        For instance: an HTMLBuilder does not consider a <p> tag to be
+        an empty-element tag (it's not in
+        HTMLBuilder.empty_element_tags). This means an empty <p> tag
+        will be presented as "<p></p>", not "<p />".
+
+        The default implementation has no opinion about which tags are
+        empty-element tags, so a tag will be presented as an
+        empty-element tag if and only if it has no contents.
+        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
+        be left alone.
+        """
+        if self.empty_element_tags is None:
+            return True
+        return tag_name in self.empty_element_tags
+
+    def feed(self, markup):
+        raise NotImplementedError()
+
+    def prepare_markup(self, markup, user_specified_encoding=None,
+                       document_declared_encoding=None):
+        return markup, None, None
+
+    def test_fragment_to_document(self, fragment):
+        """Wrap an HTML fragment to make it look like a document.
+
+        Different parsers do this differently. For instance, lxml
+        introduces an empty <head> tag, and html5lib
+        doesn't. Abstracting this away lets us write simple tests
+        which run HTML fragments through the parser and compare the
+        results against other HTML fragments.
+
+        This method should not be used outside of tests.
+        """
+        return fragment
+
+    def set_up_substitutions(self, tag):
+        pass
+
+
+class SAXTreeBuilder(TreeBuilder):
+    """A Beautiful Soup treebuilder that listens for SAX events."""
+
+    def feed(self, markup):
+        raise NotImplementedError()
+
+    def close(self):
+        pass
+
+    def startElement(self, name, attrs):
+        attrs = dict((key[1], value) for key, value in attrs.items())
+        #print "Start %s, %r" % (name, attrs)
+        self.soup.handle_starttag(name, attrs)
+
+    def endElement(self, name):
+        #print "End %s" % name
+        self.soup.handle_endtag(name)
+
+    def startElementNS(self, nsTuple, nodeName, attrs):
+        # Throw away (ns, nodeName) for now.
+        self.startElement(nodeName, attrs)
+
+    def endElementNS(self, nsTuple, nodeName):
+        # Throw away (ns, nodeName) for now.
+        self.endElement(nodeName)
+        #handler.endElementNS((ns, node.nodeName), node.nodeName)
+
+    def startPrefixMapping(self, prefix, nodeValue):
+        # Ignore the prefix for now.
+        pass
+
+    def endPrefixMapping(self, prefix):
+        # Ignore the prefix for now.
+        # handler.endPrefixMapping(prefix)
+        pass
+
+    def characters(self, content):
+        self.soup.handle_data(content)
+
+    def startDocument(self):
+        pass
+
+    def endDocument(self):
+        pass
+
+
+class HTMLTreeBuilder(TreeBuilder):
+    """This TreeBuilder knows facts about HTML.
+
+    Such as which tags are empty-element tags.
+    """
+
+    preserve_whitespace_tags = set(['pre', 'textarea'])
+    empty_element_tags = set(['br', 'hr', 'input', 'img', 'meta',
+                              'spacer', 'link', 'frame', 'base'])
+
+    # Used by set_up_substitutions to detect the charset in a META tag.
+    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
+
+    def set_up_substitutions(self, tag):
+        if tag.name != 'meta':
+            return False
+
+        http_equiv = tag.get('http-equiv')
+        content = tag.get('content')
+
+        if (http_equiv is not None
+            and content is not None
+            and http_equiv.lower() == 'content-type'):
+            # This is an interesting meta tag.
+            match = self.CHARSET_RE.search(content)
+            if match:
+                if (self.soup.declared_html_encoding is not None or
+                    self.soup.original_encoding == self.soup.from_encoding):
+                    # An HTML encoding was sniffed while converting
+                    # the document to Unicode, or an HTML encoding was
+                    # sniffed during a previous pass through the
+                    # document, or an encoding was specified
+                    # explicitly and it worked. Rewrite the meta tag.
+                    def rewrite(match):
+                        return match.group(1) + "%SOUP-ENCODING%"
+                    tag['content'] = self.CHARSET_RE.sub(rewrite, content)
+                    return True
+                else:
+                    # This is our first pass through the document.
+                    # Go through it again with the encoding information.
+                    new_charset = match.group(3)
+                    if (new_charset is not None
+                        and new_charset != self.soup.original_encoding):
+                        self.soup.declared_html_encoding = new_charset
+                        self.soup._feed(self.soup.declared_html_encoding)
+                        raise StopParsing
+                    pass
+        return False
+
+
+def register_treebuilders_from(module):
+    """Copy TreeBuilders from the given module into this module."""
+    # I'm fairly sure this is not the best way to do this.
+    this_module = sys.modules[__package__]
+    for name in module.__all__:
+        obj = getattr(module, name)
+
+        if issubclass(obj, TreeBuilder):
+            setattr(this_module, name, obj)
+            this_module.__all__.append(name)
+            # Register the builder while we're at it.
+            this_module.builder_registry.register(obj)
+
+# Builders are registered in reverse order of priority, so that custom
+# builder registrations will take precedence. In general, we want
+# html5lib to take precedence over lxml, because it's more
+# reliable. And we only want to use HTMLParser as a last resort.
+import _htmlparser
+register_treebuilders_from(_htmlparser)
+try:
+    import _lxml
+    register_treebuilders_from(_lxml)
+except ImportError:
+    # They don't have lxml installed.
+    pass
+try:
+    import _html5lib
+    register_treebuilders_from(_html5lib)
+except ImportError:
+    # They don't have html5lib installed.
+    pass
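To make the registry's lookup semantics concrete, here is a minimal sketch using two hypothetical builders (FastHTML and PermissiveHTML exist only for illustration; the real registrations happen in register_treebuilders_from above):

    from bs4.builder import TreeBuilder, TreeBuilderRegistry

    registry = TreeBuilderRegistry()

    class FastHTML(TreeBuilder):        # hypothetical, for illustration only
        features = ['fast', 'html']

    class PermissiveHTML(TreeBuilder):  # hypothetical, for illustration only
        features = ['permissive', 'html']

    registry.register(FastHTML)
    registry.register(PermissiveHTML)

    registry.lookup()              # PermissiveHTML: most recently registered
    registry.lookup('fast')        # FastHTML: the only builder with 'fast'
    registry.lookup('html')        # PermissiveHTML: ties go to the most recent
    registry.lookup('fast', 'permissive')  # None: no one builder has both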
+ """ + if isinstance(markup, unicode): + return markup, None, None + + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, isHTML=True) + return (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding) + + def feed(self, markup): + super(HTMLParserTreeBuilder, self).feed(markup) + + def handle_starttag(self, name, attrs): + self.soup.handle_starttag(name, dict(attrs)) + + def handle_endtag(self, name): + self.soup.handle_endtag(name) + + def handle_data(self, data): + self.soup.handle_data(data) + + def handle_charref(self, name): + self.handle_data(unichr(int(name))) + + def handle_entityref(self, name): + character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) + if character is not None: + data = character + else: + data = "&%s;" % name + self.handle_data(data) + + def handle_comment(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(Comment) + + def handle_decl(self, data): + self.soup.endData() + if data.startswith("DOCTYPE "): + data = data[len("DOCTYPE "):] + self.soup.handle_data(data) + self.soup.endData(Doctype) + + def unknown_decl(self, data): + if data.upper().startswith('CDATA['): + cls = CData + data = data[len('CDATA['):] + else: + cls = Declaration + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(cls) + + def handle_pi(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(ProcessingInstruction) + diff --git a/bs4/dammit.py b/bs4/dammit.py new file mode 100644 index 0000000..75d445e --- /dev/null +++ b/bs4/dammit.py @@ -0,0 +1,412 @@ +"""Beautiful Soup bonus library: Unicode, Dammit + +This class forces XML data into a standard format (usually to UTF-8 or +Unicode). It is heavily based on code from Mark Pilgrim's Universal +Feed Parser. It does not rewrite the XML or HTML to reflect a new +encoding; that's the tree builder's job. +""" + +import codecs +from htmlentitydefs import codepoint2name +import re +import types + +# Autodetects character encodings. Very useful. +# Download from http://chardet.feedparser.org/ +# or 'apt-get install python-chardet' +# or 'easy_install chardet' +try: + import chardet + #import chardet.constants + #chardet.constants._debug = 1 +except ImportError: + chardet = None + +# Available from http://cjkpython.i18n.org/. +try: + import iconv_codec +except ImportError: + pass + + +class EntitySubstitution(object): + + """Substitute XML or HTML entities for the corresponding characters.""" + + def _populate_class_variables(): + lookup = {} + reverse_lookup = {} + characters = [] + for codepoint, name in codepoint2name.items(): + if codepoint == 34: + # There's no point in turning the quotation mark into + # ", unless it happens within an attribute value, which + # is handled elsewhere. 
diff --git a/bs4/dammit.py b/bs4/dammit.py
new file mode 100644
index 0000000..75d445e
--- /dev/null
+++ b/bs4/dammit.py
@@ -0,0 +1,412 @@
+"""Beautiful Soup bonus library: Unicode, Dammit
+
+This class forces XML data into a standard format (usually to UTF-8 or
+Unicode). It is heavily based on code from Mark Pilgrim's Universal
+Feed Parser. It does not rewrite the XML or HTML to reflect a new
+encoding; that's the tree builder's job.
+"""
+
+import codecs
+from htmlentitydefs import codepoint2name
+import re
+import types
+
+# Autodetects character encodings. Very useful.
+# Download from http://chardet.feedparser.org/
+# or 'apt-get install python-chardet'
+# or 'easy_install chardet'
+try:
+    import chardet
+    #import chardet.constants
+    #chardet.constants._debug = 1
+except ImportError:
+    chardet = None
+
+# Available from http://cjkpython.i18n.org/.
+try:
+    import iconv_codec
+except ImportError:
+    pass
+
+
+class EntitySubstitution(object):
+
+    """Substitute XML or HTML entities for the corresponding characters."""
+
+    def _populate_class_variables():
+        lookup = {}
+        reverse_lookup = {}
+        characters = []
+        for codepoint, name in codepoint2name.items():
+            if codepoint == 34:
+                # There's no point in turning the quotation mark into
+                # &quot;, unless it happens within an attribute value, which
+                # is handled elsewhere.
+                continue
+            character = unichr(codepoint)
+            characters.append(character)
+            lookup[character] = name
+            reverse_lookup[name] = character
+        re_definition = "[%s]" % "".join(characters)
+        return lookup, reverse_lookup, re.compile(re_definition)
+    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
+     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
+
+
+    CHARACTER_TO_XML_ENTITY = {
+        "'": "apos",
+        '"': "quot",
+        "&": "amp",
+        "<": "lt",
+        ">": "gt",
+        }
+
+    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
+                                           "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+                                           ")")
+
+    @classmethod
+    def _substitute_html_entity(cls, matchobj):
+        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
+        return "&%s;" % entity
+
+    @classmethod
+    def _substitute_xml_entity(cls, matchobj):
+        """Used with a regular expression to substitute the
+        appropriate XML entity for an XML special character."""
+        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
+        return "&%s;" % entity
+
+    @classmethod
+    def substitute_xml(cls, value, make_quoted_attribute=False):
+        """Substitute XML entities for special XML characters.
+
+        :param value: A string to be substituted. The less-than sign will
+          become &lt;, the greater-than sign will become &gt;, and any
+          ampersands that are not part of an entity definition will
+          become &amp;.
+
+        :param make_quoted_attribute: If True, then the string will be
+          quoted, as befits an attribute value.
+
+          Ordinarily, the string will be quoted using double quotes.
+
+            Bob's Bar -> "Bob's Bar"
+
+          If the string contains double quotes, it will be quoted using
+          single quotes.
+
+            Welcome to "my bar" -> 'Welcome to "my bar"'
+
+          If the string contains both single and double quotes, the
+          double quotes will be escaped, and the string will be quoted
+          using double quotes.
+
+            Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"
+        """
+        if make_quoted_attribute:
+            quote_with = '"'
+            if '"' in value:
+                if "'" in value:
+                    # The string contains both single and double
+                    # quotes. Turn the double quotes into
+                    # entities. We quote the double quotes rather than
+                    # the single quotes because the entity name is
+                    # "&quot;" whether this is HTML or XML. If we
+                    # quoted the single quotes, we'd have to decide
+                    # between &apos; and &squot;.
+                    replace_with = "&quot;"
+                    value = value.replace('"', replace_with)
+                else:
+                    # There are double quotes but no single quotes.
+                    # We can use single quotes to quote the attribute.
+                    quote_with = "'"
+
+        # Escape angle brackets, and ampersands that aren't part of
+        # entities.
+        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
+            cls._substitute_xml_entity, value)
+        if make_quoted_attribute:
+            return quote_with + value + quote_with
+        else:
+            return value
+
+    @classmethod
+    def substitute_html(cls, s):
+        """Replace certain Unicode characters with named HTML entities.
+
+        This differs from data.encode(encoding, 'xmlcharrefreplace')
+        in that the goal is to make the result more readable (to those
+        with ASCII displays) rather than to recover from
+        errors. There's absolutely nothing wrong with a UTF-8 string
+        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
+        character with "&eacute;" will make it more readable to some
+        people.
+        """
+        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
+            cls._substitute_html_entity, s)
+
+
+class UnicodeDammit:
+    """A class for detecting the encoding of a *ML document and
+    converting it to a Unicode string. If the source encoding is
+    windows-1252, can replace MS smart quotes with their HTML or XML
+    equivalents."""
+
+    # This dictionary maps commonly seen values for "charset" in HTML
+    # meta tags to the corresponding Python codec names. It only covers
+    # values that aren't in Python's aliases and can't be determined
+    # by the heuristics in find_codec.
+    CHARSET_ALIASES = {"macintosh": "mac-roman",
+                       "x-sjis": "shift-jis"}
+
+    ENCODINGS_WITH_SMART_QUOTES = [
+        "windows-1252",
+        "iso-8859-1",
+        "iso-8859-2",
+        ]
+
+    def __init__(self, markup, override_encodings=[],
+                 smart_quotes_to=None, isHTML=False):
+        self.declared_html_encoding = None
+        self.markup, document_encoding, sniffed_encoding = \
+                     self._detectEncoding(markup, isHTML)
+        self.smart_quotes_to = smart_quotes_to
+        self.tried_encodings = []
+        if markup == '' or isinstance(markup, unicode):
+            self.original_encoding = None
+            self.unicode = unicode(markup)
+            return
+
+        u = None
+        for proposed_encoding in (
+            override_encodings + [document_encoding, sniffed_encoding]):
+            if proposed_encoding is not None:
+                u = self._convert_from(proposed_encoding)
+                if u:
+                    break
+
+        # If no luck and we have an auto-detection library, try that:
+        if not u and chardet and not isinstance(self.markup, unicode):
+            u = self._convert_from(chardet.detect(self.markup)['encoding'])
+
+        # As a last resort, try utf-8 and windows-1252:
+        if not u:
+            for proposed_encoding in ("utf-8", "windows-1252"):
+                u = self._convert_from(proposed_encoding)
+                if u:
+                    break
+
+        self.unicode = u
+        if not u: self.original_encoding = None
+
+    def _sub_ms_char(self, match):
+        """Changes a MS smart quote character to an XML or HTML
+        entity."""
+        orig = match.group(1)
+        sub = self.MS_CHARS.get(orig)
+        if type(sub) == types.TupleType:
+            if self.smart_quotes_to == 'xml':
+                sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
+            else:
+                sub = '&'.encode() + sub[0].encode() + ';'.encode()
+        else:
+            sub = sub.encode()
+        return sub
+
+    def _convert_from(self, proposed):
+        proposed = self.find_codec(proposed)
+        if not proposed or proposed in self.tried_encodings:
+            return None
+        self.tried_encodings.append(proposed)
+        markup = self.markup
+
+        # Convert smart quotes to HTML if coming from an encoding
+        # that might have them.
+        if (self.smart_quotes_to is not None
+            and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
+            smart_quotes_re = "([\x80-\x9f])"
+            smart_quotes_compiled = re.compile(smart_quotes_re)
+            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
+
+        try:
+            # print "Trying to convert document to %s" % proposed
+            u = self._to_unicode(markup, proposed)
+            self.markup = u
+            self.original_encoding = proposed
+        except Exception, e:
+            # print "That didn't work!"
+            # print e
+            return None
+        #print "Correct encoding: %s" % proposed
+        return self.markup
+
+    def _to_unicode(self, data, encoding):
+        '''Given a string and its encoding, decodes the string into Unicode.
+        %encoding is a string recognized by encodings.aliases'''
+
+        # strip Byte Order Mark (if present)
+        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
+               and (data[2:4] != '\x00\x00'):
+            encoding = 'utf-16be'
+            data = data[2:]
+        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
+                 and (data[2:4] != '\x00\x00'):
+            encoding = 'utf-16le'
+            data = data[2:]
+        elif data[:3] == '\xef\xbb\xbf':
+            encoding = 'utf-8'
+            data = data[3:]
+        elif data[:4] == '\x00\x00\xfe\xff':
+            encoding = 'utf-32be'
+            data = data[4:]
+        elif data[:4] == '\xff\xfe\x00\x00':
+            encoding = 'utf-32le'
+            data = data[4:]
+        newdata = unicode(data, encoding)
+        return newdata
+
+    def _detectEncoding(self, xml_data, isHTML=False):
+        """Given a document, tries to detect its XML encoding."""
+        xml_encoding = sniffed_xml_encoding = None
+        try:
+            if xml_data[:4] == '\x4c\x6f\xa7\x94':
+                # EBCDIC
+                xml_data = self._ebcdic_to_ascii(xml_data)
+            elif xml_data[:4] == '\x00\x3c\x00\x3f':
+                # UTF-16BE
+                sniffed_xml_encoding = 'utf-16be'
+                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
+            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
+                     and (xml_data[2:4] != '\x00\x00'):
+                # UTF-16BE with BOM
+                sniffed_xml_encoding = 'utf-16be'
+                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
+            elif xml_data[:4] == '\x3c\x00\x3f\x00':
+                # UTF-16LE
+                sniffed_xml_encoding = 'utf-16le'
+                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
+            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
+                     (xml_data[2:4] != '\x00\x00'):
+                # UTF-16LE with BOM
+                sniffed_xml_encoding = 'utf-16le'
+                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
+            elif xml_data[:4] == '\x00\x00\x00\x3c':
+                # UTF-32BE
+                sniffed_xml_encoding = 'utf-32be'
+                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
+            elif xml_data[:4] == '\x3c\x00\x00\x00':
+                # UTF-32LE
+                sniffed_xml_encoding = 'utf-32le'
+                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
+            elif xml_data[:4] == '\x00\x00\xfe\xff':
+                # UTF-32BE with BOM
+                sniffed_xml_encoding = 'utf-32be'
+                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
+            elif xml_data[:4] == '\xff\xfe\x00\x00':
+                # UTF-32LE with BOM
+                sniffed_xml_encoding = 'utf-32le'
+                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
+            elif xml_data[:3] == '\xef\xbb\xbf':
+                # UTF-8 with BOM
+                sniffed_xml_encoding = 'utf-8'
+                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
+            else:
+                sniffed_xml_encoding = 'ascii'
+                pass
+        except:
+            xml_encoding_match = None
+        xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
+        xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
+        if not xml_encoding_match and isHTML:
+            meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
+            regexp = re.compile(meta_re, re.I)
+            xml_encoding_match = regexp.search(xml_data)
+        if xml_encoding_match is not None:
+            xml_encoding = xml_encoding_match.groups()[0].decode(
+                'ascii').lower()
+            if isHTML:
+                self.declared_html_encoding = xml_encoding
+            if sniffed_xml_encoding and \
+               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
+                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
+                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
+                                 'utf16', 'u16')):
+                xml_encoding = sniffed_xml_encoding
+        return xml_data, xml_encoding, sniffed_xml_encoding
+
+
+    def find_codec(self, charset):
+        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
+               or (charset and self._codec(charset.replace("-", ""))) \
+               or (charset and self._codec(charset.replace("-", "_"))) \
+               or charset
+
+    def _codec(self, charset):
+        if not charset: return charset
+        codec = None
+        try:
+            codecs.lookup(charset)
+            codec = charset
+        except (LookupError, ValueError):
+            pass
+        return codec
+
+    EBCDIC_TO_ASCII_MAP = None
+    def _ebcdic_to_ascii(self, s):
+        c = self.__class__
+        if not c.EBCDIC_TO_ASCII_MAP:
+            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
+                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
+                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
+                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
+                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
+                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
+                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
+                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
+                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
+                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
+                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
+                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
+                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
+                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
+                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
+                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
+                    250,251,252,253,254,255)
+            import string
+            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
+                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
+        return s.translate(c.EBCDIC_TO_ASCII_MAP)
+
+    MS_CHARS = {'\x80': ('euro', '20AC'),
+                '\x81': ' ',
+                '\x82': ('sbquo', '201A'),
+                '\x83': ('fnof', '192'),
+                '\x84': ('bdquo', '201E'),
+                '\x85': ('hellip', '2026'),
+                '\x86': ('dagger', '2020'),
+                '\x87': ('Dagger', '2021'),
+                '\x88': ('circ', '2C6'),
+                '\x89': ('permil', '2030'),
+                '\x8A': ('Scaron', '160'),
+                '\x8B': ('lsaquo', '2039'),
+                '\x8C': ('OElig', '152'),
+                '\x8D': '?',
+                '\x8E': ('#x17D', '17D'),
+                '\x8F': '?',
+                '\x90': '?',
+                '\x91': ('lsquo', '2018'),
+                '\x92': ('rsquo', '2019'),
+                '\x93': ('ldquo', '201C'),
+                '\x94': ('rdquo', '201D'),
+                '\x95': ('bull', '2022'),
+                '\x96': ('ndash', '2013'),
+                '\x97': ('mdash', '2014'),
+                '\x98': ('tilde', '2DC'),
+                '\x99': ('trade', '2122'),
+                '\x9a': ('scaron', '161'),
+                '\x9b': ('rsaquo', '203A'),
+                '\x9c': ('oelig', '153'),
+                '\x9d': '?',
+                '\x9e': ('#x17E', '17E'),
+                '\x9f': ('Yuml', ''),}
diff --git a/tests/test_builder_registry.py b/tests/test_builder_registry.py
index ee5b2da..655cd06 100644
--- a/tests/test_builder_registry.py
+++ b/tests/test_builder_registry.py
@@ -8,7 +8,8 @@ from bs4.builder import (
     LXMLTreeBuilderForXML,
     LXMLTreeBuilder,
     TreeBuilderRegistry,
-    HTML5TreeBuilder
+    HTML5TreeBuilder,
+    HTMLParserTreeBuilder,
     )
 
@@ -21,6 +22,8 @@ class BuiltInRegistryTest(unittest.TestCase):
                       LXMLTreeBuilder)
         self.assertEquals(registry.lookup('permissive', 'xml'),
                       LXMLTreeBuilderForXML)
+        self.assertEquals(registry.lookup('strict', 'html'),
+                      HTMLParserTreeBuilder)
         self.assertEquals(registry.lookup('permissive', 'html'),
                       HTML5TreeBuilder)
 
@@ -36,6 +39,9 @@ class BuiltInRegistryTest(unittest.TestCase):
         self.assertEquals(registry.lookup('html5lib'),
                           HTML5TreeBuilder)
 
+        self.assertEquals(registry.lookup('html.parser'),
+                          HTMLParserTreeBuilder)
+
     def test_unimplemented_combinations(self):
         self.assertEquals(registry.lookup('fast', 'permissive', 'html'),
                           None)
diff --git a/tests/test_htmlparser.py b/tests/test_htmlparser.py
new file mode 100644
index 0000000..c8a446e
--- /dev/null
+++ b/tests/test_htmlparser.py
@@ -0,0 +1,126 @@
+from HTMLParser import HTMLParseError
+from bs4.builder import HTMLParserTreeBuilder
+from bs4.element import CData
+from test_lxml import (
+    TestLXMLBuilder,
+    TestLXMLBuilderEncodingConversion,
+    TestLXMLBuilderInvalidMarkup,
+    )
+
+class TestHTMLParserTreeBuilder(TestLXMLBuilder):
+    """See `BuilderSmokeTest`."""
+
+    @property
+    def default_builder(self):
+        return HTMLParserTreeBuilder()
+
+    def test_bare_string(self):
+        # A bare string is turned into some kind of HTML document or
+        # fragment recognizable as the original string.
+        #
+        # HTMLParser does not modify the bare string at all.
+        self.assertSoupEquals("A bare string")
+
+    def test_cdata_where_its_ok(self):
+        # HTMLParser recognizes CDATA sections and passes them through.
+        markup = "<svg><![CDATA[foobar]]></svg>"
+        self.assertSoupEquals(markup)
+        soup = self.soup(markup)
+        string = soup.svg.string
+        self.assertEquals(string, "foobar")
+        self.assertTrue(isinstance(string, CData))
+
+    # These are tests that could be 'fixed' by improving the
+    # HTMLParserTreeBuilder, but I don't think it's worth it. Users
+    # will have fewer headaches if they use one of the other tree
+    # builders.
+
+    def test_empty_element(self):
+        # HTML's empty-element tags are not recognized as such
+        # unless they are presented as empty-element tags.
+        self.assertSoupEquals(
+            "<p>A <meta> tag</p>", "<p>A <meta> tag</meta></p>")
+
+        self.assertSoupEquals(
+            "<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>")
+
+    def test_entities_in_attribute_values_converted_during_parsing(self):
+
+        # The numeric entity isn't recognized without the closing
+        # semicolon.
+        text = '<x t="pi&#241ata">'
+        expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
+        soup = self.soup(text)
+        self.assertEquals(soup.x['t'], "pi&#241ata")
+
+        text = '<x t="pi&#241;ata">'
+        expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
+        soup = self.soup(text)
+        self.assertEquals(soup.x['t'], u"pi\xf1ata")
+
+        text = '<x t="pi&ntilde;ata">'
+        soup = self.soup(text)
+        self.assertEquals(soup.x['t'], expected)
+
+        text = '<x t="sacr&eacute; bleu">'
+        soup = self.soup(text)
+        self.assertEquals(
+            soup.x['t'],
+            u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu")
+
+        # This can cause valid HTML to become invalid.
+        valid_url = '<a href="http://example.org?a=1&b=2;3">foo</a>'
+        soup = self.soup(valid_url)
+        self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3")
+
+    # I think it would be very difficult to 'fix' these tests, judging
+    # from my experience with previous versions of Beautiful Soup.
+    def test_naked_ampersands(self):
+        # Ampersands are treated as entities.
+        text = "<p>AT&T</p>"
+        soup = self.soup(text)
+        self.assertEquals(soup.p.string, "AT&T;")
+
+    def test_literal_in_textarea(self):
+        # Anything inside a <textarea> is supposed to be treated as
+        # the literal value of the field (XXX citation
+        # needed). html5lib does this correctly. But HTMLParser does its
+        # best to parse the contents of a <textarea> as HTML.
+        text = '<textarea>Junk like <b> tags and <&<&</textarea>'
+        soup = self.soup(text)
+        self.assertEquals(len(soup.textarea.contents), 2)
+        self.assertEquals(soup.textarea.contents[0], u"Junk like ")
+        self.assertEquals(soup.textarea.contents[1].name, 'b')
+        self.assertEquals(soup.textarea.b.string, u" tags and <&<&")
+
+    def test_literal_in_script(self):
+        # The contents of a <script> tag are supposed to be treated as
+        # a literal string, even if that string contains HTML. But
+        # HTMLParser attempts to parse some of the HTML, causing much
+        # pain.
+        javascript = 'if (i < 2) { alert("<b>foo</b>"); }'
+        soup = self.soup('<script>%s</script>' % javascript)
+        self.assertEquals(soup.script.contents,
+                          ['if (i < 2) { alert("<b>foo',
+                           '"); }'])
+
+    # Namespaced doctypes cause an HTMLParseError
+    def test_namespaced_system_doctype(self):
+        self.assertRaises(HTMLParseError, self._test_doctype,
+                          'xsl:stylesheet SYSTEM "htmlent.dtd"')
+
+    def test_namespaced_public_doctype(self):
+        self.assertRaises(HTMLParseError, self._test_doctype,
+                          'xsl:stylesheet PUBLIC "htmlent.dtd"')
+
+
+class TestHTMLParserTreeBuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
+    # Oddly enough, HTMLParser seems to handle invalid markup exactly
+    # the same as lxml.
+    pass
+
+
+class TestHTMLParserTreeBuilderEncodingConversion(
+    TestLXMLBuilderEncodingConversion):
+    # Re-run the lxml tests for HTMLParser.
+    pass
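The script-literal failure pinned down in test_literal_in_script is easy to reproduce by hand; as before, this sketch assumes a builder keyword on the BeautifulSoup constructor:

    from bs4 import BeautifulSoup
    from bs4.builder import HTMLParserTreeBuilder

    js = 'if (i < 2) { alert("<b>foo</b>"); }'
    soup = BeautifulSoup('<script>%s</script>' % js,
                         builder=HTMLParserTreeBuilder())
    # HTMLParser treats the <b> inside the script as real markup, so
    # the script text comes back split around a phantom tag instead
    # of as one literal string.
    print soup.script.contents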
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 0eec688..7e83eff 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -168,27 +168,6 @@ class TestLXMLBuilder(SoupTest):
         expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
         self.assertSoupEquals(text, expected)
 
-    def test_entities_in_attribute_values_converted_during_parsing(self):
-        text = '<x t="pi&#241;ata">'
-        expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
-        soup = self.soup(text)
-        self.assertEquals(soup.x['t'], expected)
-
-        text = '<x t="pi&ntilde;ata">'
-        soup = self.soup(text)
-        self.assertEquals(soup.x['t'], expected)
-
-        text = '<x t="sacr&eacute; bleu">'
-        soup = self.soup(text)
-        self.assertEquals(
-            soup.x['t'],
-            u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu")
-
-        # This can cause valid HTML to become invalid.
-        valid_url = '<a href="http://example.org?a=1&b=2;3">foo</a>'
-        soup = self.soup(valid_url)
-        self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3")
-
     def test_smart_quotes_converted_on_the_way_in(self):
         # Microsoft smart quotes are converted to Unicode characters during
         # parsing.
@@ -230,7 +209,7 @@ class TestLXMLBuilder(SoupTest):
         # Test a namespaced doctype with a system id.
         self._test_doctype('xsl:stylesheet SYSTEM "htmlent.dtd"')
 
-    def test_namespaced_system_doctype(self):
+    def test_namespaced_public_doctype(self):
         # Test a namespaced doctype with a public id.
         self._test_doctype('xsl:stylesheet PUBLIC "htmlent.dtd"')
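The tests moved out of test_lxml.py cover entity handling on the way in; the way out is handled by EntitySubstitution, defined in bs4/dammit.py above:

    from bs4.dammit import EntitySubstitution

    EntitySubstitution.substitute_html(u'Sacr\xe9 bleu!')
    # -> u'Sacr&eacute; bleu!'

    # Attribute quoting switches to single quotes when the value
    # contains double quotes; bare ampersands are escaped but
    # existing entities are left alone.
    EntitySubstitution.substitute_xml('Welcome to "my bar"',
                                      make_quoted_attribute=True)
    # -> '\'Welcome to "my bar"\''
    EntitySubstitution.substitute_xml('AT&T &amp; friends',
                                      make_quoted_attribute=True)
    # -> '"AT&amp;T &amp; friends"'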