diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-20 18:15:17 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-20 18:15:17 -0500 |
commit | 1fa18e957db92cfa056151f4e0d93c44243df1d9 (patch) | |
tree | 3a41d6a36fb55e6a98320eb5ab6f65a4e307e0e4 | |
parent | fe6a756f95d724456c368544949d41c16d0cc95b (diff) |
Started work on a tagging system that should make it easy to find a tree builder that meets your needs.
-rw-r--r-- | beautifulsoup/builder/__init__.py | 41 | ||||
-rw-r--r-- | beautifulsoup/builder/_html5lib.py | 9 |
2 files changed, 35 insertions, 15 deletions
```diff
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 5e55f7f..0888cef 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -8,6 +8,14 @@ __all__ = [
     'TreeBuilder',
     ]
 
+# Some useful keywords.
+FAST = 'fast'
+ACCURATE = 'accurate'
+XML = 'xml'
+HTML = 'html'
+
+builders_for_tag = {}
+
 class TreeBuilder(Entities):
     """Turn a document into a Beautiful Soup object tree."""
 
@@ -166,19 +174,26 @@ class HTMLTreeBuilder(TreeBuilder):
 
 
 def register_builders_from(module):
+    """Copy everything in __all___ from the given module into this module."""
     # I'm fairly sure this is not the best way to do this.
-
-    # Copy everything mentioned in the builder module's __all__ into
-    # this module.
     this_module = sys.modules[__package__]
     for name in module.__all__:
-        setattr(this_module, name, getattr(module, name))
-
-    # Add all names from the builder module's __all__ to this module's
-    # __all__.
-    this_module.__all__ += module.__all__
-
-import _lxml
-register_builders_from(_lxml)
-import _html5lib
-register_builders_from(_html5lib)
+        obj = getattr(module, name)
+        setattr(this_module, name, obj)
+        this_module.__all__.append(name)
+
+# Builders are registered in reverse order of priority, so that custom
+# builder registrations will take precedence. In general, we want
+# html5lib to take precedence over lxml, because it's more reliable.
+try:
+    import _lxml
+    register_builders_from(_lxml)
+except ImportError:
+    # They don't have lxml installed.
+    pass
+try:
+    import _html5lib
+    register_builders_from(_html5lib)
+except ImportError:
+    # They don't have html5lib installed.
+    pass
diff --git a/beautifulsoup/builder/_html5lib.py b/beautifulsoup/builder/_html5lib.py
index 9cca0b0..020b7ea 100644
--- a/beautifulsoup/builder/_html5lib.py
+++ b/beautifulsoup/builder/_html5lib.py
@@ -2,7 +2,11 @@ __all__ = [
     'HTML5TreeBuilder',
     ]
 
-from beautifulsoup.builder import HTMLTreeBuilder, SAXTreeBuilder
+from beautifulsoup.builder import (
+    ACCURATE,
+    HTML,
+    HTMLTreeBuilder,
+    )
 import html5lib
 from html5lib.constants import DataLossWarning
 import warnings
@@ -13,10 +17,11 @@ from beautifulsoup.element import (
     Tag,
     )
 
-
 class HTML5TreeBuilder(HTMLTreeBuilder):
     """Use html5lib to build a tree."""
 
+    tags = [ACCURATE, HTML]
+
     def prepare_markup(self, markup, user_specified_encoding):
         # Store the user-specified encoding for use later on.
         self.user_specified_encoding = user_specified_encoding
```