diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-20 18:28:15 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-20 18:28:15 -0500 |
commit | 2f0b16b8ea82b09034dff0e747e29d89e97b5680 (patch) | |
tree | f35913725a819cdd0068180357468dee5bb89240 | |
parent | 1fa18e957db92cfa056151f4e0d93c44243df1d9 (diff) |
Tree builders now advertise their features.
-rw-r--r-- | beautifulsoup/builder/__init__.py | 33 |
-rw-r--r-- | beautifulsoup/builder/_html5lib.py | 7 |
-rw-r--r-- | beautifulsoup/builder/_lxml.py | 11 |
3 files changed, 38 insertions, 13 deletions
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py index 0888cef..4178730 100644 --- a/beautifulsoup/builder/__init__.py +++ b/beautifulsoup/builder/__init__.py @@ -1,3 +1,4 @@ +from collections import defaultdict import re import sys from beautifulsoup.element import Entities @@ -8,17 +9,20 @@ __all__ = [ 'TreeBuilder', ] -# Some useful keywords. +# Some useful features for a TreeBuilder to have. FAST = 'fast' -ACCURATE = 'accurate' +PERMISSIVE = 'permissive' XML = 'xml' HTML = 'html' -builders_for_tag = {} +# Which builders have a given feature? +treebuilders_for_feature = defaultdict(list) class TreeBuilder(Entities): """Turn a document into a Beautiful Soup object tree.""" + features = [] + assume_html = False preserve_whitespace_tags = set() empty_element_tags = None # A tag will be considered an empty-element @@ -173,27 +177,38 @@ class HTMLTreeBuilder(TreeBuilder): return False -def register_builders_from(module): - """Copy everything in __all___ from the given module into this module.""" +def register_treebuilder(treebuilder_class): + """Register a treebuilder based on its advertised features.""" + for feature in treebuilder_class.features: + treebuilders_for_feature[feature].append(treebuilder_class) + + +def register_treebuilders_from(module): + """Copy TreeBuilder subclasses from the given module into this module.""" # I'm fairly sure this is not the best way to do this. this_module = sys.modules[__package__] for name in module.__all__: obj = getattr(module, name) - setattr(this_module, name, obj) - this_module.__all__.append(name) + + if issubclass(obj, TreeBuilder): + setattr(this_module, name, obj) + this_module.__all__.append(name) + register_treebuilder(obj) # Builders are registered in reverse order of priority, so that custom # builder registrations will take precedence. In general, we want # html5lib to take precedence over lxml, because it's more reliable. 
try: import _lxml - register_builders_from(_lxml) + register_treebuilders_from(_lxml) except ImportError: # They don't have lxml installed. pass try: import _html5lib - register_builders_from(_html5lib) + register_treebuilders_from(_html5lib) except ImportError: # They don't have html5lib installed. pass + +print treebuilders_for_feature diff --git a/beautifulsoup/builder/_html5lib.py b/beautifulsoup/builder/_html5lib.py index 020b7ea..395fb9d 100644 --- a/beautifulsoup/builder/_html5lib.py +++ b/beautifulsoup/builder/_html5lib.py @@ -3,7 +3,7 @@ __all__ = [ ] from beautifulsoup.builder import ( - ACCURATE, + PERMISSIVE, HTML, HTMLTreeBuilder, ) @@ -20,7 +20,7 @@ from beautifulsoup.element import ( class HTML5TreeBuilder(HTMLTreeBuilder): """Use html5lib to build a tree.""" - tags = [ACCURATE, HTML] + features = [PERMISSIVE, HTML] def prepare_markup(self, markup, user_specified_encoding): # Store the user-specified encoding for use later on. @@ -55,7 +55,8 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): def __init__(self, soup, namespaceHTMLElements): self.soup = soup if namespaceHTMLElements: - warnings.warn("namespaceHTMLElements not supported yet", DataLossWarning) + warnings.warn("namespaceHTMLElements not supported yet", + DataLossWarning) super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) def documentClass(self): diff --git a/beautifulsoup/builder/_lxml.py b/beautifulsoup/builder/_lxml.py index c2f368c..f5cc242 100644 --- a/beautifulsoup/builder/_lxml.py +++ b/beautifulsoup/builder/_lxml.py @@ -5,13 +5,20 @@ __all__ = [ from lxml import etree from beautifulsoup.element import Comment, Doctype -from beautifulsoup.builder import TreeBuilder, HTMLTreeBuilder +from beautifulsoup.builder import ( + FAST, + HTML, + HTMLTreeBuilder, + TreeBuilder, + XML) from beautifulsoup.dammit import UnicodeDammit import types class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_PARSER_CLASS = etree.XMLParser + features = [FAST, XML] + 
@property def default_parser(self): # This can either return a parser object or a class, which @@ -79,6 +86,8 @@ class LXMLTreeBuilderForXML(TreeBuilder): class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): + features = [FAST, HTML] + @property def default_parser(self): return etree.HTMLParser |