summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- beautifulsoup/__init__.py 27
-rw-r--r-- beautifulsoup/builder/__init__.py 1
-rw-r--r-- beautifulsoup/builder/_html5lib.py 3
-rw-r--r-- tests/test_tree.py 3
4 files changed, 20 insertions, 14 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index b8598e2..93a610a 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -64,6 +64,7 @@ __all__ = ['BeautifulSoup']
import re
from util import isList, isString, buildSet
+from builder import registry
from dammit import UnicodeDammit
from element import Entities, NavigableString, Tag
@@ -92,29 +93,31 @@ class BeautifulSoup(Tag):
"""
ROOT_TAG_NAME = u'[document]'
+ # If the end-user gives no indication which tree builder they
+ # want, look for one with these features.
+ DEFAULT_BUILDER_FEATURES = ['html']
+
# Used when determining whether a text node is all whitespace and
# can be replaced with a single space. A text node that contains
# fancy Unicode spaces (usually non-breaking) should be left
# alone.
STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
- @classmethod
- def default_builder(self):
- try:
- from builder import HTML5TreeBuilder
- return HTML5TreeBuilder()
- except ImportError:
- from builder import LXMLTreeBuilder
- return LXMLTreeBuilder()
-
- def __init__(self, markup="", builder=None, parse_only=None,
- from_encoding=None):
+ def __init__(self, markup="", parse_only=None, from_encoding=None,
+ builder=None, *features):
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
is fed into the underlying parser."""
if builder is None:
- builder = self.default_builder()
+ if len(features) == 0:
+ features = self.DEFAULT_BUILDER_FEATURES
+ builder = registry.lookup(*features)
+ if builder is None:
+ raise ValueError(
+ "Couldn't find a tree builder with the features you "
+ "requested: %s. Do you need to install a parser library?"
+ % ",".join(features))
self.builder = builder
self.builder.soup = self
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 854cc56..385dd50 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -15,6 +15,7 @@ FAST = 'fast'
PERMISSIVE = 'permissive'
XML = 'xml'
HTML = 'html'
+HTML_5 = 'html5'
class TreeBuilderRegistry(object):
diff --git a/beautifulsoup/builder/_html5lib.py b/beautifulsoup/builder/_html5lib.py
index 175ea9b..f8a7a40 100644
--- a/beautifulsoup/builder/_html5lib.py
+++ b/beautifulsoup/builder/_html5lib.py
@@ -5,6 +5,7 @@ __all__ = [
from beautifulsoup.builder import (
PERMISSIVE,
HTML,
+ HTML_5,
HTMLTreeBuilder,
)
import html5lib
@@ -20,7 +21,7 @@ from beautifulsoup.element import (
class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree."""
- features = ['html5lib', PERMISSIVE, HTML]
+ features = ['html5lib', PERMISSIVE, HTML_5, HTML]
def prepare_markup(self, markup, user_specified_encoding):
# Store the user-specified encoding for use later on.
diff --git a/tests/test_tree.py b/tests/test_tree.py
index cefdf4a..0232bac 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -13,6 +13,7 @@ import copy
import cPickle as pickle
import re
from beautifulsoup import BeautifulSoup
+from beautifulsoup.builder import registry
from beautifulsoup.element import CData, SoupStrainer, Tag
from beautifulsoup.testing import SoupTest
@@ -523,7 +524,7 @@ class TestTreeModification(SoupTest):
self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>'))
def test_new_tag_creation(self):
- builder = BeautifulSoup.default_builder()
+ builder = registry.lookup('html5lib')()
soup = self.soup("<body></body>", builder=builder)
a = Tag(soup, builder, 'a')
ol = Tag(soup, builder, 'ol')