summary refs log tree commit diff
path: root/bs4
diff options
context:
space:
mode:
Diffstat (limited to 'bs4')
-rw-r--r-- bs4/builder/_htmlparser.py 12
-rw-r--r-- bs4/tests/test_builder_registry.py 131
-rw-r--r-- bs4/tests/test_html5lib.py 17
-rw-r--r-- bs4/tests/test_tree.py 2
4 files changed, 158 insertions, 4 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index c293d9e..f9476cd 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -4,7 +4,12 @@ __all__ = [
'HTMLParserTreeBuilder',
]
-from HTMLParser import HTMLParser
+# Prefer the Python 3 module location and fall back to the Python 2 one.
+# CONSTRUCTOR_TAKES_STRICT records which import succeeded; the tree
+# builder's constructor consults it before passing ``strict``.
+# NOTE(review): assumes every html.parser release accepts ``strict`` --
+# confirm against the interpreter versions that must be supported.
+try:
+    from html.parser import HTMLParser
+    CONSTRUCTOR_TAKES_STRICT = True
+except ImportError:
+    from HTMLParser import HTMLParser
+    CONSTRUCTOR_TAKES_STRICT = False
from bs4.element import (
CData,
Comment,
@@ -28,6 +33,11 @@ class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder):
is_xml = False
features = [HTML, STRICT, HTMLPARSER]
+    def __init__(self, *args, **kwargs):
+        """Initialise the parser, forcing strict mode when it is supported.
+
+        Positional and keyword arguments are forwarded unchanged to the
+        next ``__init__`` in the MRO, except that ``strict=True`` is
+        injected (overriding any caller-supplied value) whenever
+        CONSTRUCTOR_TAKES_STRICT is true.
+        """
+        if CONSTRUCTOR_TAKES_STRICT:
+            kwargs.update(strict=True)
+        parent = super(HTMLParserTreeBuilder, self)
+        return parent.__init__(*args, **kwargs)
+
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
"""
diff --git a/bs4/tests/test_builder_registry.py b/bs4/tests/test_builder_registry.py
new file mode 100644
index 0000000..17caace
--- /dev/null
+++ b/bs4/tests/test_builder_registry.py
@@ -0,0 +1,131 @@
+"""Tests of the builder registry."""
+
+import unittest
+
+from bs4 import BeautifulSoup
+from bs4.builder import (
+ builder_registry as registry,
+ HTMLParserTreeBuilder,
+ LXMLTreeBuilderForXML,
+ LXMLTreeBuilder,
+ TreeBuilderRegistry,
+)
+
+# html5lib is optional. Record whether its tree builder could be
+# imported so the html5lib-specific assertions can be skipped.
+try:
+    from bs4.builder import (
+        HTML5TreeBuilder,
+        )
+    # Without this assignment the flag is unbound whenever the import
+    # succeeds, and every ``if HTML5LIB_PRESENT`` check raises NameError.
+    HTML5LIB_PRESENT = True
+except ImportError:
+    HTML5LIB_PRESENT = False
+
+
+class BuiltInRegistryTest(unittest.TestCase):
+    """Test the built-in registry with the default builders registered."""
+
+    def test_combination(self):
+        # Each pair of features should resolve to one specific builder.
+        self.assertEqual(registry.lookup('fast', 'html'),
+                         LXMLTreeBuilder)
+        self.assertEqual(registry.lookup('permissive', 'xml'),
+                         LXMLTreeBuilderForXML)
+        self.assertEqual(registry.lookup('strict', 'html'),
+                         HTMLParserTreeBuilder)
+        if HTML5LIB_PRESENT:
+            self.assertEqual(registry.lookup('permissive', 'html'),
+                             HTML5TreeBuilder)
+
+    def test_lookup_by_markup_type(self):
+        # The builder expected for plain 'html' depends on whether
+        # html5lib could be imported.
+        if HTML5LIB_PRESENT:
+            self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
+        else:
+            self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
+        self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
+
+    def test_named_library(self):
+        # Builders can also be looked up by the name of the library
+        # that backs them.
+        self.assertEqual(registry.lookup('lxml', 'xml'),
+                         LXMLTreeBuilderForXML)
+        self.assertEqual(registry.lookup('lxml', 'html'),
+                         LXMLTreeBuilder)
+        if HTML5LIB_PRESENT:
+            self.assertEqual(registry.lookup('html5lib'),
+                             HTML5TreeBuilder)
+
+        self.assertEqual(registry.lookup('html.parser'),
+                         HTMLParserTreeBuilder)
+
+    def test_unimplemented_combinations(self):
+        # A combination that no registered builder offers yields None.
+        self.assertEqual(registry.lookup('fast', 'permissive', 'html'),
+                         None)
+
+    def test_beautifulsoup_constructor_does_lookup(self):
+        # You can pass in a string.
+        BeautifulSoup("", features="html")
+        # Or a list of strings.
+        BeautifulSoup("", features=["html", "fast"])
+
+        # You'll get an exception if BS can't find an appropriate
+        # builder.
+        self.assertRaises(ValueError, BeautifulSoup,
+                          "", features="no-such-feature")
+
+class RegistryTest(unittest.TestCase):
+    """Test the TreeBuilderRegistry class in general."""
+
+    def setUp(self):
+        # Every test starts from an empty, private registry.
+        self.registry = TreeBuilderRegistry()
+
+    def builder_for_features(self, *feature_list):
+        """Create a stub builder class, register it, and return it.
+
+        The class is named after the given features and exposes them
+        through its ``features`` attribute.
+        """
+        cls = type('Builder_' + '_'.join(feature_list),
+                   (object,), {'features' : feature_list})
+
+        self.registry.register(cls)
+        return cls
+
+    def test_register_with_no_features(self):
+        builder = self.builder_for_features()
+
+        # Since the builder advertises no features, you can't find it
+        # by looking up features.
+        self.assertEqual(self.registry.lookup('foo'), None)
+
+        # But you can find it by doing a lookup with no features, if
+        # this happens to be the only registered builder.
+        self.assertEqual(self.registry.lookup(), builder)
+
+    def test_register_with_features_makes_lookup_succeed(self):
+        builder = self.builder_for_features('foo', 'bar')
+        self.assertEqual(self.registry.lookup('foo'), builder)
+        self.assertEqual(self.registry.lookup('bar'), builder)
+
+    def test_lookup_fails_when_no_builder_implements_feature(self):
+        # Called only for its side effect of registering a builder.
+        self.builder_for_features('foo', 'bar')
+        self.assertEqual(self.registry.lookup('baz'), None)
+
+    def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
+        self.builder_for_features('foo')
+        builder2 = self.builder_for_features('bar')
+        self.assertEqual(self.registry.lookup(), builder2)
+
+    def test_lookup_fails_when_no_tree_builders_registered(self):
+        self.assertEqual(self.registry.lookup(), None)
+
+    def test_lookup_gets_most_recent_builder_supporting_all_features(self):
+        # Registration order is what this test exercises, so the calls
+        # stay in this exact sequence; only the two builders that the
+        # assertions compare against are given names.
+        self.builder_for_features('foo')
+        self.builder_for_features('bar')
+        has_both_early = self.builder_for_features('foo', 'bar', 'baz')
+        has_both_late = self.builder_for_features('foo', 'bar', 'quux')
+        self.builder_for_features('bar')
+        self.builder_for_features('foo')
+
+        # There are two builders featuring 'foo' and 'bar', but
+        # the one that also features 'quux' was registered later.
+        self.assertEqual(self.registry.lookup('foo', 'bar'),
+                         has_both_late)
+
+        # There is only one builder featuring 'foo', 'bar', and 'baz'.
+        self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
+                         has_both_early)
+
+    def test_lookup_fails_when_cannot_reconcile_requested_features(self):
+        # Each requested feature is offered by a different builder, so
+        # no single builder satisfies both.
+        self.builder_for_features('foo', 'bar')
+        self.builder_for_features('foo', 'baz')
+        self.assertEqual(self.registry.lookup('bar', 'baz'), None)
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 4d8dcc0..85cedbf 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -1,11 +1,19 @@
-from bs4.builder import HTML5TreeBuilder
+# html5lib is optional; remember whether its tree builder is importable
+# so the test classes in this module can be skipped when it is not.
+try:
+    from bs4.builder import HTML5TreeBuilder
+    HTML5LIB_PRESENT = True
+except ImportError:
+    HTML5LIB_PRESENT = False
from bs4.element import Comment, SoupStrainer
from test_lxml import (
TestLXMLBuilder,
TestLXMLBuilderInvalidMarkup,
TestLXMLBuilderEncodingConversion,
)
+import unittest
+@unittest.skipIf(
+ not HTML5LIB_PRESENT,
+ "html5lib seems not to be present, not testing its tree builder.")
class TestHTML5Builder(TestLXMLBuilder):
"""See `BuilderSmokeTest`."""
@@ -73,7 +81,9 @@ class TestHTML5Builder(TestLXMLBuilder):
# get a CData object.
self.assertSoupEquals(markup, "<svg><!--[CDATA[foobar]]--></svg>")
-
+@unittest.skipIf(
+ not HTML5LIB_PRESENT,
+ "html5lib seems not to be present, not testing it on invalid markup.")
class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
"""See `BuilderInvalidMarkupSmokeTest`."""
@@ -210,6 +220,9 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
self.assertEquals(soup.p.string, u"\N{REPLACEMENT CHARACTER}")
+@unittest.skipIf(
+ not HTML5LIB_PRESENT,
+ "html5lib seems not to be present, not testing encoding conversion.")
class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion):
@property
def default_builder(self):
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 68677ca..865ac68 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -546,7 +546,7 @@ class TestTreeModification(SoupTest):
self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>'))
def test_new_tag_creation(self):
- builder = builder_registry.lookup('html5lib')()
+ builder = builder_registry.lookup('html')()
soup = self.soup("<body></body>", builder=builder)
a = Tag(soup, builder, 'a')
ol = Tag(soup, builder, 'ol')