summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--beautifulsoup/builder/__init__.py71
-rw-r--r--beautifulsoup/builder/_html5lib.py2
-rw-r--r--beautifulsoup/builder/_lxml.py8
3 files changed, 66 insertions, 15 deletions
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 4178730..522960a 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -7,6 +7,7 @@ __all__ = [
'HTMLTreeBuilder',
'SAXTreeBuilder',
'TreeBuilder',
+ 'TreeBuilderRegistry',
]
# Some useful features for a TreeBuilder to have.
@@ -15,8 +16,61 @@ PERMISSIVE = 'permissive'
XML = 'xml'
HTML = 'html'
-# Which builders have a given feature?
-treebuilders_for_feature = defaultdict(list)
+
+class TreeBuilderRegistry(object):
+
+ def __init__(self):
+ self.builders_for_feature = defaultdict(list)
+ self.builders = []
+
+ def register(self, treebuilder_class):
+ """Register a treebuilder based on its advertised features."""
+ for feature in treebuilder_class.features:
+ self.builders_for_feature[feature].insert(0, treebuilder_class)
+ self.builders.insert(0, treebuilder_class)
+
+ def lookup(self, *features):
+ if len(self.builders) == 0:
+ # There are no builders at all.
+ return None
+
+ if len(features) == 0:
+ # They didn't ask for any features. Give them the most
+ # recently registered builder.
+ return self.builders[0]
+
+ # Go down the list of features in order, and eliminate any builders
+ # that don't match every feature.
+ features = list(features)
+ features.reverse()
+ candidates = None
+ candidate_set = None
+ while len(features) > 0:
+ feature = features.pop()
+ we_have_the_feature = self.builders_for_feature.get(feature, [])
+ if len(we_have_the_feature) > 0:
+ if candidates is None:
+ candidates = we_have_the_feature
+ candidate_set = set(candidates)
+ else:
+ # Eliminate any candidates that don't have this feature.
+ candidate_set = candidate_set.intersection(
+ set(we_have_the_feature))
+
+ # The only valid candidates are the ones in candidate_set.
+ # Go through the original list of candidates and pick the first one
+ # that's in candidate_set.
+ if candidate_set is None:
+ return None
+ for candidate in candidates:
+ if candidate in candidate_set:
+ return candidate
+ return None
+
+# The BeautifulSoup class will take feature lists from developers and use them
+# to look up builders in this registry.
+registry = TreeBuilderRegistry()
+
class TreeBuilder(Entities):
"""Turn a document into a Beautiful Soup object tree."""
@@ -177,14 +231,8 @@ class HTMLTreeBuilder(TreeBuilder):
return False
-def register_treebuilder(treebuilder_class):
- """Register a treebuilder based on its advertised features."""
- for feature in treebuilder_class.features:
- treebuilders_for_feature[feature].append(treebuilder_class)
-
-
def register_treebuilders_from(module):
- """Copy TreeBuilder subclasses from the given module into this module."""
+ """Copy TreeBuilders from the given module into this module."""
# I'm fairly sure this is not the best way to do this.
this_module = sys.modules[__package__]
for name in module.__all__:
@@ -193,7 +241,8 @@ def register_treebuilders_from(module):
if issubclass(obj, TreeBuilder):
setattr(this_module, name, obj)
this_module.__all__.append(name)
- register_treebuilder(obj)
+ # Register the builder while we're at it.
+ this_module.registry.register(obj)
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want
@@ -210,5 +259,3 @@ try:
except ImportError:
# They don't have html5lib installed.
pass
-
-print treebuilders_for_feature
diff --git a/beautifulsoup/builder/_html5lib.py b/beautifulsoup/builder/_html5lib.py
index 395fb9d..175ea9b 100644
--- a/beautifulsoup/builder/_html5lib.py
+++ b/beautifulsoup/builder/_html5lib.py
@@ -20,7 +20,7 @@ from beautifulsoup.element import (
class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree."""
- features = [PERMISSIVE, HTML]
+ features = ['html5lib', PERMISSIVE, HTML]
def prepare_markup(self, markup, user_specified_encoding):
# Store the user-specified encoding for use later on.
diff --git a/beautifulsoup/builder/_lxml.py b/beautifulsoup/builder/_lxml.py
index f5cc242..4c7a826 100644
--- a/beautifulsoup/builder/_lxml.py
+++ b/beautifulsoup/builder/_lxml.py
@@ -9,15 +9,19 @@ from beautifulsoup.builder import (
FAST,
HTML,
HTMLTreeBuilder,
+ PERMISSIVE,
TreeBuilder,
XML)
from beautifulsoup.dammit import UnicodeDammit
import types
+LXML = 'lxml'
+
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
- features = [FAST, XML]
+ # Well, it's permissive by XML parser standards.
+ features = [LXML, XML, FAST, PERMISSIVE]
@property
def default_parser(self):
@@ -86,7 +90,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
- features = [FAST, HTML]
+ features = [LXML, HTML, FAST]
@property
def default_parser(self):