diff options
-rw-r--r-- | beautifulsoup/builder/__init__.py | 71 | ||||
-rw-r--r-- | beautifulsoup/builder/_html5lib.py | 2 | ||||
-rw-r--r-- | beautifulsoup/builder/_lxml.py | 8 |
3 files changed, 66 insertions, 15 deletions
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py index 4178730..522960a 100644 --- a/beautifulsoup/builder/__init__.py +++ b/beautifulsoup/builder/__init__.py @@ -7,6 +7,7 @@ __all__ = [ 'HTMLTreeBuilder', 'SAXTreeBuilder', 'TreeBuilder', + 'TreeBuilderRegistry', ] # Some useful features for a TreeBuilder to have. @@ -15,8 +16,61 @@ PERMISSIVE = 'permissive' XML = 'xml' HTML = 'html' -# Which builders have a given feature? -treebuilders_for_feature = defaultdict(list) + +class TreeBuilderRegistry(object): + + def __init__(self): + self.builders_for_feature = defaultdict(list) + self.builders = [] + + def register(self, treebuilder_class): + """Register a treebuilder based on its advertised features.""" + for feature in treebuilder_class.features: + self.builders_for_feature[feature].insert(0, treebuilder_class) + self.builders.insert(0, treebuilder_class) + + def lookup(self, *features): + if len(self.builders) == 0: + # There are no builders at all. + return None + + if len(features) == 0: + # They didn't ask for any features. Give them the most + # recently registered builder. + return self.builders[0] + + # Go down the list of features in order, and eliminate any builders + # that don't match every feature. + features = list(features) + features.reverse() + candidates = None + candidate_set = None + while len(features) > 0: + feature = features.pop() + we_have_the_feature = self.builders_for_feature.get(feature, []) + if len(we_have_the_feature) > 0: + if candidates is None: + candidates = we_have_the_feature + candidate_set = set(candidates) + else: + # Eliminate any candidates that don't have this feature. + candidate_set = candidate_set.intersection( + set(we_have_the_feature)) + + # The only valid candidates are the ones in candidate_set. + # Go through the original list of candidates and pick the first one + # that's in candidate_set. + if candidate_set is None: + return None + for candidate in candidates: + if candidate in candidate_set: + return candidate + return None + +# The BeautifulSoup class will take feature lists from developers and use them +# to look up builders in this registry. +registry = TreeBuilderRegistry() + class TreeBuilder(Entities): """Turn a document into a Beautiful Soup object tree.""" @@ -177,14 +231,8 @@ class HTMLTreeBuilder(TreeBuilder): return False -def register_treebuilder(treebuilder_class): - """Register a treebuilder based on its advertised features.""" - for feature in treebuilder_class.features: - treebuilders_for_feature[feature].append(treebuilder_class) - - def register_treebuilders_from(module): - """Copy TreeBuilder subclasses from the given module into this module.""" + """Copy TreeBuilders from the given module into this module.""" # I'm fairly sure this is not the best way to do this. this_module = sys.modules[__package__] for name in module.__all__: @@ -193,7 +241,8 @@ def register_treebuilders_from(module): if issubclass(obj, TreeBuilder): setattr(this_module, name, obj) this_module.__all__.append(name) - register_treebuilder(obj) + # Register the builder while we're at it. + this_module.registry.register(obj) # Builders are registered in reverse order of priority, so that custom # builder registrations will take precedence. In general, we want @@ -210,5 +259,3 @@ try: except ImportError: # They don't have html5lib installed. pass - -print treebuilders_for_feature diff --git a/beautifulsoup/builder/_html5lib.py b/beautifulsoup/builder/_html5lib.py index 395fb9d..175ea9b 100644 --- a/beautifulsoup/builder/_html5lib.py +++ b/beautifulsoup/builder/_html5lib.py @@ -20,7 +20,7 @@ from beautifulsoup.element import ( class HTML5TreeBuilder(HTMLTreeBuilder): """Use html5lib to build a tree.""" - features = [PERMISSIVE, HTML] + features = ['html5lib', PERMISSIVE, HTML] def prepare_markup(self, markup, user_specified_encoding): # Store the user-specified encoding for use later on. diff --git a/beautifulsoup/builder/_lxml.py b/beautifulsoup/builder/_lxml.py index f5cc242..4c7a826 100644 --- a/beautifulsoup/builder/_lxml.py +++ b/beautifulsoup/builder/_lxml.py @@ -9,15 +9,19 @@ from beautifulsoup.builder import ( FAST, HTML, HTMLTreeBuilder, + PERMISSIVE, TreeBuilder, XML) from beautifulsoup.dammit import UnicodeDammit import types +LXML = 'lxml' + class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_PARSER_CLASS = etree.XMLParser - features = [FAST, XML] + # Well, it's permissive by XML parser standards. + features = [LXML, XML, FAST, PERMISSIVE] @property def default_parser(self): @@ -86,7 +90,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): - features = [FAST, HTML] + features = [LXML, HTML, FAST] @property def default_parser(self): |