diff options
-rw-r--r-- | beautifulsoup/__init__.py | 27 | ||||
-rw-r--r-- | beautifulsoup/builder/__init__.py | 1 | ||||
-rw-r--r-- | beautifulsoup/builder/_html5lib.py | 3 | ||||
-rw-r--r-- | tests/test_tree.py | 3 |
4 files changed, 20 insertions, 14 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py index b8598e2..93a610a 100644 --- a/beautifulsoup/__init__.py +++ b/beautifulsoup/__init__.py @@ -64,6 +64,7 @@ __all__ = ['BeautifulSoup'] import re from util import isList, isString, buildSet +from builder import registry from dammit import UnicodeDammit from element import Entities, NavigableString, Tag @@ -92,29 +93,31 @@ class BeautifulSoup(Tag): """ ROOT_TAG_NAME = u'[document]' + # If the end-user gives no indication which tree builder they + # want, look for one with these features. + DEFAULT_BUILDER_FEATURES = ['html'] + # Used when determining whether a text node is all whitespace and # can be replaced with a single space. A text node that contains # fancy Unicode spaces (usually non-breaking) should be left # alone. STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } - @classmethod - def default_builder(self): - try: - from builder import HTML5TreeBuilder - return HTML5TreeBuilder() - except ImportError: - from builder import LXMLTreeBuilder - return LXMLTreeBuilder() - - def __init__(self, markup="", builder=None, parse_only=None, - from_encoding=None): + def __init__(self, markup="", parse_only=None, from_encoding=None, + builder=None, *features): """The Soup object is initialized as the 'root tag', and the provided markup (which can be a string or a file-like object) is fed into the underlying parser.""" if builder is None: - builder = self.default_builder() + if len(features) == 0: + features = self.DEFAULT_BUILDER_FEATURES + builder = registry.lookup(*features) + if builder is None: + raise ValueError( + "Couldn't find a tree builder with the features you " + "requested: %s. Do you need to install a parser library?" + % ",".join(features)) self.builder = builder self.builder.soup = self diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py index 854cc56..385dd50 100644 --- a/beautifulsoup/builder/__init__.py +++ b/beautifulsoup/builder/__init__.py @@ -15,6 +15,7 @@ FAST = 'fast' PERMISSIVE = 'permissive' XML = 'xml' HTML = 'html' +HTML_5 = 'html5' class TreeBuilderRegistry(object): diff --git a/beautifulsoup/builder/_html5lib.py b/beautifulsoup/builder/_html5lib.py index 175ea9b..f8a7a40 100644 --- a/beautifulsoup/builder/_html5lib.py +++ b/beautifulsoup/builder/_html5lib.py @@ -5,6 +5,7 @@ __all__ = [ from beautifulsoup.builder import ( PERMISSIVE, HTML, + HTML_5, HTMLTreeBuilder, ) import html5lib @@ -20,7 +21,7 @@ from beautifulsoup.element import ( class HTML5TreeBuilder(HTMLTreeBuilder): """Use html5lib to build a tree.""" - features = ['html5lib', PERMISSIVE, HTML] + features = ['html5lib', PERMISSIVE, HTML_5, HTML] def prepare_markup(self, markup, user_specified_encoding): # Store the user-specified encoding for use later on. diff --git a/tests/test_tree.py b/tests/test_tree.py index cefdf4a..0232bac 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -13,6 +13,7 @@ import copy import cPickle as pickle import re from beautifulsoup import BeautifulSoup +from beautifulsoup.builder import registry from beautifulsoup.element import CData, SoupStrainer, Tag from beautifulsoup.testing import SoupTest @@ -523,7 +524,7 @@ class TestTreeModification(SoupTest): self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>')) def test_new_tag_creation(self): - builder = BeautifulSoup.default_builder() + builder = registry.lookup('html5lib')() soup = self.soup("<body></body>", builder=builder) a = Tag(soup, builder, 'a') ol = Tag(soup, builder, 'ol') |