Added a registry for tree builders and made it possible to find a tree builder that has the features you want from the BeautifulSoup constructor.

author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-20 19:50:45 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-20 19:50:45 -0500
commit: 2fa73e2cb99b0816148ade6150f378993907534e (patch)
tree: cf7e2371881c680990157cae621f6045f5941f56
parent: e6320fad4cd162ab6c7dfe02be5206f5c3f8c25b (diff)
parent: ce3742abd4c7fe39247569e82e2b3acdd6052bb1 (diff)
10 files changed, 254 insertions, 69 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 3fb4f36..c9a4ca7 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -21,6 +21,11 @@ Some attributes have also been renamed:
 
  * Tag.isSelfClosing -> Tag.is_empty_element
 
+So have some arguments to popular methods:
+
+ * BeautifulSoup(parseOnlyThese=...) -> BeautifulSoup(parse_only=...)
+ * BeautifulSoup(fromEncoding=...) -> BeautifulSoup(from_encoding=...)
+
 == Generators are now properties ==
 
 The generators have been given more sensible (and PEP 8-compliant)
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index 968be08..c998924 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -3,34 +3,14 @@ Elixir and Tonic
 "The Screen-Scraper's Friend"
 http://www.crummy.com/software/BeautifulSoup/
 
-Beautiful Soup parses a (possibly invalid) XML or HTML document into a
-tree representation. It provides methods and Pythonic idioms that make
-it easy to navigate, search, and modify the tree.
+Beautiful Soup uses a plug-in parser to parse a (possibly invalid) XML
+or HTML document into a tree representation. The parser does the work
+of building a parse tree, and Beautiful Soup provides methods
+and Pythonic idioms that make it easy to navigate, search, and modify
+the parse tree.
 
-A well-formed XML/HTML document yields a well-formed data
-structure. An ill-formed XML/HTML document yields a correspondingly
-ill-formed data structure. If your document is only locally
-well-formed, you can use this library to find and process the
-well-formed part of it.
-
-Beautiful Soup works with Python 2.2 and up. It has no external
-dependencies, but you'll have more success at converting data to UTF-8
-if you also install these three packages:
-
-* chardet, for auto-detecting character encodings
-  http://chardet.feedparser.org/
-* cjkcodecs and iconv_codec, which add more encodings to the ones supported
-  by stock Python.
-  http://cjkpython.i18n.org/
-
-Beautiful Soup defines classes for two main parsing strategies:
-
- * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
-   language that kind of looks like XML.
-
- * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
-   or invalid. This class has web browser-like heuristics for
-   obtaining a sensible parse tree in the face of common HTML errors.
+Beautiful Soup works with Python 2.5 and up. To get it to work, you
+must install either lxml or html5lib.
 
 For more than you ever wanted to know about Beautiful Soup, see the
 documentation:
@@ -38,7 +18,7 @@ http://www.crummy.com/software/BeautifulSoup/documentation.html
 
 Here, have some legalese:
 
-Copyright (c) 2004-2009, Leonard Richardson
+Copyright (c) 2004-2011, Leonard Richardson
 
 All rights reserved.
 
@@ -84,6 +64,7 @@ __all__ = ['BeautifulSoup']
 import re
 
 from util import isList, isString, buildSet
+from builder import builder_registry
 from dammit import UnicodeDammit
 from element import Entities, NavigableString, Tag
 
@@ -112,40 +93,45 @@ class BeautifulSoup(Tag):
     """
     ROOT_TAG_NAME = u'[document]'
 
+    # If the end-user gives no indication which tree builder they
+    # want, look for one with these features.
+    DEFAULT_BUILDER_FEATURES = ['html']
+
     # Used when determining whether a text node is all whitespace and
     # can be replaced with a single space. A text node that contains
     # fancy Unicode spaces (usually non-breaking) should be left
     # alone.
     STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
 
-    @classmethod
-    def default_builder(self):
-        try:
-            from builder import HTML5TreeBuilder
-            return HTML5TreeBuilder()
-        except ImportError:
-            from builder import LXMLTreeBuilder
-            return LXMLTreeBuilder()
-
-    def __init__(self, markup="", builder=None, parseOnlyThese=None,
-                 fromEncoding=None):
+    def __init__(self, markup="", features=None, builder=None,
+                 parse_only=None, from_encoding=None):
         """The Soup object is initialized as the 'root tag', and the
         provided markup (which can be a string or a file-like object)
         is fed into the underlying parser."""
 
         if builder is None:
-            builder = self.default_builder()
+            if isinstance(features, basestring):
+                features = [features]
+            if len(features) == 0:
+                features = self.DEFAULT_BUILDER_FEATURES
+            builder_class = builder_registry.lookup(*features)
+            if builder_class is None:
+                raise ValueError(
+                    "Couldn't find a tree builder with the features you "
+                    "requested: %s. Do you need to install a parser library?"
+                    % ",".join(features))
+            builder = builder_class()
         self.builder = builder
         self.builder.soup = self
 
-        self.parseOnlyThese = parseOnlyThese
+        self.parse_only = parse_only
 
         self.reset()
 
         if hasattr(markup, 'read'):        # It's a file-type object.
             markup = markup.read()
         self.markup, self.original_encoding, self.declared_html_encoding = (
-            self.builder.prepare_markup(markup, fromEncoding))
+            self.builder.prepare_markup(markup, from_encoding))
 
         try:
             self._feed()
@@ -201,9 +187,9 @@ class BeautifulSoup(Tag):
                 else:
                     currentData = ' '
             self.currentData = []
-            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
-                   (not self.parseOnlyThese.text or \
-                    not self.parseOnlyThese.search(currentData)):
+            if self.parse_only and len(self.tagStack) <= 1 and \
+                   (not self.parse_only.text or \
+                    not self.parse_only.search(currentData)):
                 return
             o = containerClass(currentData)
             self.object_was_parsed(o)
@@ -251,9 +237,9 @@ class BeautifulSoup(Tag):
         #print "Start tag %s: %s" % (name, attrs)
         self.endData()
 
-        if (self.parseOnlyThese and len(self.tagStack) <= 1
-            and (self.parseOnlyThese.text
-                 or not self.parseOnlyThese.searchTag(name, attrs))):
+        if (self.parse_only and len(self.tagStack) <= 1
+            and (self.parse_only.text
+                 or not self.parse_only.searchTag(name, attrs))):
             return None
 
         tag = Tag(self, self.builder, name, attrs, self.currentTag,
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 0888cef..b97c5f9 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -1,3 +1,4 @@
+from collections import defaultdict
 import re
 import sys
 from beautifulsoup.element import Entities
@@ -6,19 +7,77 @@ __all__ = [
     'HTMLTreeBuilder',
     'SAXTreeBuilder',
     'TreeBuilder',
+    'TreeBuilderRegistry',
     ]
 
-# Some useful keywords.
+# Some useful features for a TreeBuilder to have.
 FAST = 'fast'
-ACCURATE = 'accurate'
+PERMISSIVE = 'permissive'
 XML = 'xml'
 HTML = 'html'
+HTML_5 = 'html5'
+
+
+class TreeBuilderRegistry(object):
+
+    def __init__(self):
+        self.builders_for_feature = defaultdict(list)
+        self.builders = []
+
+    def register(self, treebuilder_class):
+        """Register a treebuilder based on its advertised features."""
+        for feature in treebuilder_class.features:
+            self.builders_for_feature[feature].insert(0, treebuilder_class)
+        self.builders.insert(0, treebuilder_class)
+
+    def lookup(self, *features):
+        if len(self.builders) == 0:
+            # There are no builders at all.
+            return None
+
+        if len(features) == 0:
+            # They didn't ask for any features. Give them the most
+            # recently registered builder.
+            return self.builders[0]
+
+        # Go down the list of features in order, and eliminate any builders
+        # that don't match every feature.
+        features = list(features)
+        features.reverse()
+        candidates = None
+        candidate_set = None
+        while len(features) > 0:
+            feature = features.pop()
+            we_have_the_feature = self.builders_for_feature.get(feature, [])
+            if len(we_have_the_feature) > 0:
+                if candidates is None:
+                    candidates = we_have_the_feature
+                    candidate_set = set(candidates)
+                else:
+                    # Eliminate any candidates that don't have this feature.
+                    candidate_set = candidate_set.intersection(
+                        set(we_have_the_feature))
+
+        # The only valid candidates are the ones in candidate_set.
+        # Go through the original list of candidates and pick the first one
+        # that's in candidate_set.
+        if candidate_set is None:
+            return None
+        for candidate in candidates:
+            if candidate in candidate_set:
+                return candidate
+        return None
+
+# The BeautifulSoup class will take feature lists from developers and use them
+# to look up builders in this registry.
+builder_registry = TreeBuilderRegistry()
 
-builders_for_tag = {}
 
 class TreeBuilder(Entities):
     """Turn a document into a Beautiful Soup object tree."""
 
+    features = []
+
     assume_html = False
     preserve_whitespace_tags = set()
     empty_element_tags = None # A tag will be considered an empty-element
@@ -150,7 +209,7 @@ class HTMLTreeBuilder(TreeBuilder):
             match = self.CHARSET_RE.search(content)
             if match:
                 if (self.soup.declared_html_encoding is not None or
-                    self.soup.original_encoding == self.soup.fromEncoding):
+                    self.soup.original_encoding == self.soup.from_encoding):
                     # An HTML encoding was sniffed while converting
                     # the document to Unicode, or an HTML encoding was
                     # sniffed during a previous pass through the
@@ -173,27 +232,31 @@ class HTMLTreeBuilder(TreeBuilder):
         return False
 
 
-def register_builders_from(module):
-    """Copy everything in __all___ from the given module into this module."""
+def register_treebuilders_from(module):
+    """Copy TreeBuilders from the given module into this module."""
     # I'm fairly sure this is not the best way to do this.
     this_module = sys.modules[__package__]
     for name in module.__all__:
         obj = getattr(module, name)
-        setattr(this_module, name, obj)
-        this_module.__all__.append(name)
+
+        if issubclass(obj, TreeBuilder):
+            setattr(this_module, name, obj)
+            this_module.__all__.append(name)
+            # Register the builder while we're at it.
+            this_module.builder_registry.register(obj)
 
 # Builders are registered in reverse order of priority, so that custom
 # builder registrations will take precedence. In general, we want
 # html5lib to take precedence over lxml, because it's more reliable.
 try:
     import _lxml
-    register_builders_from(_lxml)
+    register_treebuilders_from(_lxml)
 except ImportError:
     # They don't have lxml installed.
     pass
 try:
     import _html5lib
-    register_builders_from(_html5lib)
+    register_treebuilders_from(_html5lib)
 except ImportError:
     # They don't have html5lib installed.
     pass
diff --git a/beautifulsoup/builder/_html5lib.py b/beautifulsoup/builder/_html5lib.py
index 020b7ea..f8a7a40 100644
--- a/beautifulsoup/builder/_html5lib.py
+++ b/beautifulsoup/builder/_html5lib.py
@@ -3,8 +3,9 @@ __all__ = [
     ]
 
 from beautifulsoup.builder import (
-    ACCURATE,
+    PERMISSIVE,
     HTML,
+    HTML_5,
     HTMLTreeBuilder,
     )
 import html5lib
@@ -20,7 +21,7 @@ from beautifulsoup.element import (
 class HTML5TreeBuilder(HTMLTreeBuilder):
     """Use html5lib to build a tree."""
 
-    tags = [ACCURATE, HTML]
+    features = ['html5lib', PERMISSIVE, HTML_5, HTML]
 
     def prepare_markup(self, markup, user_specified_encoding):
         # Store the user-specified encoding for use later on.
@@ -55,7 +56,8 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
     def __init__(self, soup, namespaceHTMLElements):
         self.soup = soup
         if namespaceHTMLElements:
-            warnings.warn("namespaceHTMLElements not supported yet", DataLossWarning)
+            warnings.warn("namespaceHTMLElements not supported yet",
+                          DataLossWarning)
         super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
 
     def documentClass(self):
diff --git a/beautifulsoup/builder/_lxml.py b/beautifulsoup/builder/_lxml.py
index c2f368c..4c7a826 100644
--- a/beautifulsoup/builder/_lxml.py
+++ b/beautifulsoup/builder/_lxml.py
@@ -5,13 +5,24 @@ __all__ = [
 
 from lxml import etree
 from beautifulsoup.element import Comment, Doctype
-from beautifulsoup.builder import TreeBuilder, HTMLTreeBuilder
+from beautifulsoup.builder import (
+    FAST,
+    HTML,
+    HTMLTreeBuilder,
+    PERMISSIVE,
+    TreeBuilder,
+    XML)
 from beautifulsoup.dammit import UnicodeDammit
 import types
 
+LXML = 'lxml'
+
 class LXMLTreeBuilderForXML(TreeBuilder):
     DEFAULT_PARSER_CLASS = etree.XMLParser
 
+    # Well, it's permissive by XML parser standards.
+    features = [LXML, XML, FAST, PERMISSIVE]
+
     @property
     def default_parser(self):
         # This can either return a parser object or a class, which
@@ -79,6 +90,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
 
 class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
 
+    features = [LXML, HTML, FAST]
+
     @property
     def default_parser(self):
         return etree.HTMLParser
diff --git a/tests/test_builder_registry.py b/tests/test_builder_registry.py
new file mode 100644
index 0000000..17e3fb1
--- /dev/null
+++ b/tests/test_builder_registry.py
@@ -0,0 +1,115 @@
+"""Tests of the builder registry."""
+
+import unittest
+
+from beautifulsoup import BeautifulSoup
+from beautifulsoup.builder import (
+    builder_registry as registry,
+    LXMLTreeBuilderForXML,
+    LXMLTreeBuilder,
+    TreeBuilderRegistry,
+    HTML5TreeBuilder
+)
+
+
+
+class BuiltInRegistryTest(unittest.TestCase):
+    """Test the built-in registry with the default builders registered."""
+
+    def test_combination(self):
+        self.assertEquals(registry.lookup('fast', 'html'),
+                          LXMLTreeBuilder)
+        self.assertEquals(registry.lookup('permissive', 'xml'),
+                          LXMLTreeBuilderForXML)
+        self.assertEquals(registry.lookup('permissive', 'html'),
+                          HTML5TreeBuilder)
+
+    def test_lookup_by_markup_type(self):
+        self.assertEquals(registry.lookup('html'), HTML5TreeBuilder)
+        self.assertEquals(registry.lookup('xml'), LXMLTreeBuilderForXML)
+
+    def test_named_library(self):
+        self.assertEquals(registry.lookup('lxml', 'xml'),
+                          LXMLTreeBuilderForXML)
+        self.assertEquals(registry.lookup('lxml', 'html'),
+                          LXMLTreeBuilder)
+        self.assertEquals(registry.lookup('html5lib'),
+                          HTML5TreeBuilder)
+
+    def test_unimplemented_combinations(self):
+        self.assertEquals(registry.lookup('fast', 'permissive', 'html'),
+                          None)
+
+    def test_beautifulsoup_constructor_does_lookup(self):
+        # You can pass in a string.
+        BeautifulSoup("", features="html")
+        # Or a list of strings.
+        BeautifulSoup("", features=["html", "permissive"])
+
+        # You'll get an exception if BS can't find an appropriate
+        # builder.
+        self.assertRaises(ValueError, BeautifulSoup,
+                          "", features="no-such-feature")
+
+class RegistryTest(unittest.TestCase):
+    """Test the TreeBuilderRegistry class in general."""
+
+    def setUp(self):
+        self.registry = TreeBuilderRegistry()
+
+    def builder_for_features(self, *feature_list):
+        cls = type('Builder_' + '_'.join(feature_list),
+                   (object,), {'features' : feature_list})
+
+        self.registry.register(cls)
+        return cls
+
+    def test_register_with_no_features(self):
+        builder = self.builder_for_features()
+
+        # Since the builder advertises no features, you can't find it
+        # by looking up features.
+        self.assertEquals(self.registry.lookup('foo'), None)
+
+        # But you can find it by doing a lookup with no features, if
+        # this happens to be the only registered builder.
+        self.assertEquals(self.registry.lookup(), builder)
+
+    def test_register_with_features_makes_lookup_succeed(self):
+        builder = self.builder_for_features('foo', 'bar')
+        self.assertEquals(self.registry.lookup('foo'), builder)
+        self.assertEquals(self.registry.lookup('bar'), builder)
+
+    def test_lookup_fails_when_no_builder_implements_feature(self):
+        builder = self.builder_for_features('foo', 'bar')
+        self.assertEquals(self.registry.lookup('baz'), None)
+
+    def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
+        builder1 = self.builder_for_features('foo')
+        builder2 = self.builder_for_features('bar')
+        self.assertEquals(self.registry.lookup(), builder2)
+
+    def test_lookup_fails_when_no_tree_builders_registered(self):
+        self.assertEquals(self.registry.lookup(), None)
+
+    def test_lookup_gets_most_recent_builder_supporting_all_features(self):
+        has_one = self.builder_for_features('foo')
+        has_the_other = self.builder_for_features('bar')
+        has_both_early = self.builder_for_features('foo', 'bar', 'baz')
+        has_both_late = self.builder_for_features('foo', 'bar', 'quux')
+        lacks_one = self.builder_for_features('bar')
+        has_the_other = self.builder_for_features('foo')
+
+        # There are two builders featuring 'foo' and 'bar', but
+        # the one that also features 'quux' was registered later.
+        self.assertEquals(self.registry.lookup('foo', 'bar'),
+                          has_both_late)
+
+        # There is only one builder featuring 'foo', 'bar', and 'baz'.
+        self.assertEquals(self.registry.lookup('foo', 'bar', 'baz'),
+                          has_both_early)
+
+    def test_lookup_fails_when_cannot_reconcile_requested_features(self):
+        builder1 = self.builder_for_features('foo', 'bar')
+        builder2 = self.builder_for_features('foo', 'baz')
+        self.assertEquals(self.registry.lookup('bar', 'baz'), None)
diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py
index aa0bad2..5abc29d 100644
--- a/tests/test_html5lib.py
+++ b/tests/test_html5lib.py
@@ -18,7 +18,7 @@ class TestHTML5Builder(TestLXMLBuilder):
         strainer = SoupStrainer("b")
         markup = "<p>A <b>bold</b> statement.</p>"
         soup = self.soup(markup,
-                         parseOnlyThese=strainer)
+                         parse_only=strainer)
         self.assertEquals(
             soup.decode(), self.document_for(markup))
 
@@ -210,7 +210,7 @@ class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion):
         # A real-world test to make sure we can convert ISO-8859-9 (a
         # Hebrew encoding) to UTF-8.
         soup = self.soup(self.HEBREW_DOCUMENT,
-                         fromEncoding="iso-8859-8")
+                         from_encoding="iso-8859-8")
         self.assertEquals(soup.original_encoding, 'iso8859-8')
         self.assertEquals(
             soup.encode('utf-8'),
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 9d08aef..df2f341 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -325,7 +325,7 @@ class TestLXMLBuilder(SoupTest):
     def test_soupstrainer(self):
         strainer = SoupStrainer("b")
         soup = self.soup("A <b>bold</b> <meta /> <i>statement</i>",
-                         parseOnlyThese=strainer)
+                         parse_only=strainer)
         self.assertEquals(soup.decode(), "<b>bold</b>")
 
 
@@ -506,7 +506,7 @@ class TestLXMLBuilderEncodingConversion(SoupTest):
         # A real-world test to make sure we can convert ISO-8859-9 (a
         # Hebrew encoding) to UTF-8.
         soup = self.soup(self.HEBREW_DOCUMENT,
-                         fromEncoding="iso-8859-8")
+                         from_encoding="iso-8859-8")
         self.assertEquals(soup.original_encoding, 'iso-8859-8')
         self.assertEquals(
             soup.encode('utf-8'),
diff --git a/tests/test_soup.py b/tests/test_soup.py
index 01dff53..bb2262a 100644
--- a/tests/test_soup.py
+++ b/tests/test_soup.py
@@ -12,7 +12,7 @@ class TestSelectiveParsing(SoupTest):
     def test_parse_with_soupstrainer(self):
         markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
         strainer = SoupStrainer("b")
-        soup = self.soup(markup, parseOnlyThese=strainer)
+        soup = self.soup(markup, parse_only=strainer)
         self.assertEquals(soup.encode(), "<b>Yes</b><b>Yes <c>Yes</c></b>")
 
 
diff --git a/tests/test_tree.py b/tests/test_tree.py
index 384d518..0b3d72e 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -13,6 +13,7 @@ import copy
 import cPickle as pickle
 import re
 from beautifulsoup import BeautifulSoup
+from beautifulsoup.builder import builder_registry
 from beautifulsoup.element import CData, SoupStrainer, Tag
 from beautifulsoup.testing import SoupTest
 
@@ -523,7 +524,7 @@ class TestTreeModification(SoupTest):
         self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>'))
 
     def test_new_tag_creation(self):
-        builder = BeautifulSoup.default_builder()
+        builder = builder_registry.lookup('html5lib')()
         soup = self.soup("<body></body>", builder=builder)
         a = Tag(soup, builder, 'a')
         ol = Tag(soup, builder, 'ol')
@@ -863,7 +864,7 @@ class TestSubstitutions(SoupTest):
         # meta tag got filtered out by the strainer. This test makes
         # sure that doesn't happen.
         strainer = SoupStrainer('pre')
-        soup = self.soup(markup, parseOnlyThese=strainer)
+        soup = self.soup(markup, parse_only=strainer)
         self.assertEquals(soup.contents[0].name, 'pre')
author	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-20 19:50:45 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-20 19:50:45 -0500
commit	2fa73e2cb99b0816148ade6150f378993907534e (patch)
tree	cf7e2371881c680990157cae621f6045f5941f56
parent	e6320fad4cd162ab6c7dfe02be5206f5c3f8c25b (diff)
parent	ce3742abd4c7fe39247569e82e2b3acdd6052bb1 (diff)