Tree builders now advertise their features.

author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-20 18:28:15 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-20 18:28:15 -0500
commit: 2f0b16b8ea82b09034dff0e747e29d89e97b5680 (patch)
tree: f35913725a819cdd0068180357468dee5bb89240
parent: 1fa18e957db92cfa056151f4e0d93c44243df1d9 (diff)
3 files changed, 38 insertions, 13 deletions
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 0888cef..4178730 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -1,3 +1,4 @@
+from collections import defaultdict
 import re
 import sys
 from beautifulsoup.element import Entities
@@ -8,17 +9,20 @@ __all__ = [
     'TreeBuilder',
     ]
 
-# Some useful keywords.
+# Some useful features for a TreeBuilder to have.
 FAST = 'fast'
-ACCURATE = 'accurate'
+PERMISSIVE = 'permissive'
 XML = 'xml'
 HTML = 'html'
 
-builders_for_tag = {}
+# Which builders have a given feature?
+treebuilders_for_feature = defaultdict(list)
 
 class TreeBuilder(Entities):
     """Turn a document into a Beautiful Soup object tree."""
 
+    features = []
+
     assume_html = False
     preserve_whitespace_tags = set()
     empty_element_tags = None # A tag will be considered an empty-element
@@ -173,27 +177,38 @@ class HTMLTreeBuilder(TreeBuilder):
         return False
 
 
-def register_builders_from(module):
-    """Copy everything in __all___ from the given module into this module."""
+def register_treebuilder(treebuilder_class):
+    """Register a treebuilder based on its advertised features."""
+    for feature in treebuilder_class.features:
+        treebuilders_for_feature[feature].append(treebuilder_class)
+
+
+def register_treebuilders_from(module):
+    """Copy TreeBuilder subclasses from the given module into this module."""
     # I'm fairly sure this is not the best way to do this.
     this_module = sys.modules[__package__]
     for name in module.__all__:
         obj = getattr(module, name)
-        setattr(this_module, name, obj)
-        this_module.__all__.append(name)
+
+        if issubclass(obj, TreeBuilder):
+            setattr(this_module, name, obj)
+            this_module.__all__.append(name)
+            register_treebuilder(obj)
 
 # Builders are registered in reverse order of priority, so that custom
 # builder registrations will take precedence. In general, we want
 # html5lib to take precedence over lxml, because it's more reliable.
 try:
     import _lxml
-    register_builders_from(_lxml)
+    register_treebuilders_from(_lxml)
 except ImportError:
     # They don't have lxml installed.
     pass
 try:
     import _html5lib
-    register_builders_from(_html5lib)
+    register_treebuilders_from(_html5lib)
 except ImportError:
     # They don't have html5lib installed.
     pass
+
+print treebuilders_for_feature
diff --git a/beautifulsoup/builder/_html5lib.py b/beautifulsoup/builder/_html5lib.py
index 020b7ea..395fb9d 100644
--- a/beautifulsoup/builder/_html5lib.py
+++ b/beautifulsoup/builder/_html5lib.py
@@ -3,7 +3,7 @@ __all__ = [
     ]
 
 from beautifulsoup.builder import (
-    ACCURATE,
+    PERMISSIVE,
     HTML,
     HTMLTreeBuilder,
     )
@@ -20,7 +20,7 @@ from beautifulsoup.element import (
 class HTML5TreeBuilder(HTMLTreeBuilder):
     """Use html5lib to build a tree."""
 
-    tags = [ACCURATE, HTML]
+    features = [PERMISSIVE, HTML]
 
     def prepare_markup(self, markup, user_specified_encoding):
         # Store the user-specified encoding for use later on.
@@ -55,7 +55,8 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
     def __init__(self, soup, namespaceHTMLElements):
         self.soup = soup
         if namespaceHTMLElements:
-            warnings.warn("namespaceHTMLElements not supported yet", DataLossWarning)
+            warnings.warn("namespaceHTMLElements not supported yet",
+                          DataLossWarning)
         super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
 
     def documentClass(self):
diff --git a/beautifulsoup/builder/_lxml.py b/beautifulsoup/builder/_lxml.py
index c2f368c..f5cc242 100644
--- a/beautifulsoup/builder/_lxml.py
+++ b/beautifulsoup/builder/_lxml.py
@@ -5,13 +5,20 @@ __all__ = [
 
 from lxml import etree
 from beautifulsoup.element import Comment, Doctype
-from beautifulsoup.builder import TreeBuilder, HTMLTreeBuilder
+from beautifulsoup.builder import (
+    FAST,
+    HTML,
+    HTMLTreeBuilder,
+    TreeBuilder,
+    XML)
 from beautifulsoup.dammit import UnicodeDammit
 import types
 
 class LXMLTreeBuilderForXML(TreeBuilder):
     DEFAULT_PARSER_CLASS = etree.XMLParser
 
+    features = [FAST, XML]
+
     @property
     def default_parser(self):
         # This can either return a parser object or a class, which
@@ -79,6 +86,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
 
 class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
 
+    features = [FAST, HTML]
+
     @property
     def default_parser(self):
         return etree.HTMLParser
author	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-20 18:28:15 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-20 18:28:15 -0500
commit	2f0b16b8ea82b09034dff0e747e29d89e97b5680 (patch)
tree	f35913725a819cdd0068180357468dee5bb89240
parent	1fa18e957db92cfa056151f4e0d93c44243df1d9 (diff)