summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-20 18:28:15 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-20 18:28:15 -0500
commit: 2f0b16b8ea82b09034dff0e747e29d89e97b5680 (patch)
tree: f35913725a819cdd0068180357468dee5bb89240
parent: 1fa18e957db92cfa056151f4e0d93c44243df1d9 (diff)
Tree builders now advertise their features.
-rw-r--r-- beautifulsoup/builder/__init__.py 33
-rw-r--r-- beautifulsoup/builder/_html5lib.py 7
-rw-r--r-- beautifulsoup/builder/_lxml.py 11
3 files changed, 38 insertions, 13 deletions
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 0888cef..4178730 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -1,3 +1,4 @@
+from collections import defaultdict
import re
import sys
from beautifulsoup.element import Entities
@@ -8,17 +9,20 @@ __all__ = [
'TreeBuilder',
]
-# Some useful keywords.
+# Some useful features for a TreeBuilder to have.
FAST = 'fast'
-ACCURATE = 'accurate'
+PERMISSIVE = 'permissive'
XML = 'xml'
HTML = 'html'
-builders_for_tag = {}
+# Which builders have a given feature?
+treebuilders_for_feature = defaultdict(list)
class TreeBuilder(Entities):
"""Turn a document into a Beautiful Soup object tree."""
+ features = []
+
assume_html = False
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
@@ -173,27 +177,38 @@ class HTMLTreeBuilder(TreeBuilder):
return False
-def register_builders_from(module):
- """Copy everything in __all___ from the given module into this module."""
+def register_treebuilder(treebuilder_class):
+ """Register a treebuilder based on its advertised features."""
+ for feature in treebuilder_class.features:
+ treebuilders_for_feature[feature].append(treebuilder_class)
+
+
+def register_treebuilders_from(module):
+ """Copy TreeBuilder subclasses from the given module into this module."""
# I'm fairly sure this is not the best way to do this.
this_module = sys.modules[__package__]
for name in module.__all__:
obj = getattr(module, name)
- setattr(this_module, name, obj)
- this_module.__all__.append(name)
+
+ if issubclass(obj, TreeBuilder):
+ setattr(this_module, name, obj)
+ this_module.__all__.append(name)
+ register_treebuilder(obj)
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want
# html5lib to take precedence over lxml, because it's more reliable.
try:
import _lxml
- register_builders_from(_lxml)
+ register_treebuilders_from(_lxml)
except ImportError:
# They don't have lxml installed.
pass
try:
import _html5lib
- register_builders_from(_html5lib)
+ register_treebuilders_from(_html5lib)
except ImportError:
# They don't have html5lib installed.
pass
+
+print treebuilders_for_feature
diff --git a/beautifulsoup/builder/_html5lib.py b/beautifulsoup/builder/_html5lib.py
index 020b7ea..395fb9d 100644
--- a/beautifulsoup/builder/_html5lib.py
+++ b/beautifulsoup/builder/_html5lib.py
@@ -3,7 +3,7 @@ __all__ = [
]
from beautifulsoup.builder import (
- ACCURATE,
+ PERMISSIVE,
HTML,
HTMLTreeBuilder,
)
@@ -20,7 +20,7 @@ from beautifulsoup.element import (
class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree."""
- tags = [ACCURATE, HTML]
+ features = [PERMISSIVE, HTML]
def prepare_markup(self, markup, user_specified_encoding):
# Store the user-specified encoding for use later on.
@@ -55,7 +55,8 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
def __init__(self, soup, namespaceHTMLElements):
self.soup = soup
if namespaceHTMLElements:
- warnings.warn("namespaceHTMLElements not supported yet", DataLossWarning)
+ warnings.warn("namespaceHTMLElements not supported yet",
+ DataLossWarning)
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
def documentClass(self):
diff --git a/beautifulsoup/builder/_lxml.py b/beautifulsoup/builder/_lxml.py
index c2f368c..f5cc242 100644
--- a/beautifulsoup/builder/_lxml.py
+++ b/beautifulsoup/builder/_lxml.py
@@ -5,13 +5,20 @@ __all__ = [
from lxml import etree
from beautifulsoup.element import Comment, Doctype
-from beautifulsoup.builder import TreeBuilder, HTMLTreeBuilder
+from beautifulsoup.builder import (
+ FAST,
+ HTML,
+ HTMLTreeBuilder,
+ TreeBuilder,
+ XML)
from beautifulsoup.dammit import UnicodeDammit
import types
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
+ features = [FAST, XML]
+
@property
def default_parser(self):
# This can either return a parser object or a class, which
@@ -79,6 +86,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
+ features = [FAST, HTML]
+
@property
def default_parser(self):
return etree.HTMLParser