diff options
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/builder/_htmlparser.py | 12 | ||||
-rw-r--r-- | bs4/tests/test_builder_registry.py | 131 | ||||
-rw-r--r-- | bs4/tests/test_html5lib.py | 17 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 2 |
4 files changed, 158 insertions, 4 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index c293d9e..f9476cd 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -4,7 +4,12 @@ __all__ = [ 'HTMLParserTreeBuilder', ] -from HTMLParser import HTMLParser +try: + from html.parser import HTMLParser + CONSTRUCTOR_TAKES_STRICT = True +except ImportError, e: + from HTMLParser import HTMLParser + CONSTRUCTOR_TAKES_STRICT = False from bs4.element import ( CData, Comment, @@ -28,6 +33,11 @@ class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder): is_xml = False features = [HTML, STRICT, HTMLPARSER] + def __init__(self, *args, **kwargs): + if CONSTRUCTOR_TAKES_STRICT: + kwargs['strict'] = True + return super(HTMLParserTreeBuilder, self).__init__(*args, **kwargs) + def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): """ diff --git a/bs4/tests/test_builder_registry.py b/bs4/tests/test_builder_registry.py new file mode 100644 index 0000000..17caace --- /dev/null +++ b/bs4/tests/test_builder_registry.py @@ -0,0 +1,131 @@ +"""Tests of the builder registry.""" + +import unittest + +from bs4 import BeautifulSoup +from bs4.builder import ( + builder_registry as registry, + HTMLParserTreeBuilder, + LXMLTreeBuilderForXML, + LXMLTreeBuilder, + TreeBuilderRegistry, +) + +try: + from bs4.builder import ( + HTML5TreeBuilder, + ) +except ImportError: + HTML5LIB_PRESENT = False + + +class BuiltInRegistryTest(unittest.TestCase): + """Test the built-in registry with the default builders registered.""" + + def test_combination(self): + self.assertEquals(registry.lookup('fast', 'html'), + LXMLTreeBuilder) + self.assertEquals(registry.lookup('permissive', 'xml'), + LXMLTreeBuilderForXML) + self.assertEquals(registry.lookup('strict', 'html'), + HTMLParserTreeBuilder) + if HTML5LIB_PRESENT: + self.assertEquals(registry.lookup('permissive', 'html'), + HTML5TreeBuilder) + + def test_lookup_by_markup_type(self): + if HTML5LIB_PRESENT: + self.assertEquals(registry.lookup('html'), HTML5TreeBuilder) + else: + self.assertEquals(registry.lookup('html'), LXMLTreeBuilder) + self.assertEquals(registry.lookup('xml'), LXMLTreeBuilderForXML) + + def test_named_library(self): + self.assertEquals(registry.lookup('lxml', 'xml'), + LXMLTreeBuilderForXML) + self.assertEquals(registry.lookup('lxml', 'html'), + LXMLTreeBuilder) + if HTML5LIB_PRESENT: + self.assertEquals(registry.lookup('html5lib'), + HTML5TreeBuilder) + + self.assertEquals(registry.lookup('html.parser'), + HTMLParserTreeBuilder) + + def test_unimplemented_combinations(self): + self.assertEquals(registry.lookup('fast', 'permissive', 'html'), + None) + + def test_beautifulsoup_constructor_does_lookup(self): + # You can pass in a string. + BeautifulSoup("", features="html") + # Or a list of strings. + BeautifulSoup("", features=["html", "fast"]) + + # You'll get an exception if BS can't find an appropriate + # builder. + self.assertRaises(ValueError, BeautifulSoup, + "", features="no-such-feature") + +class RegistryTest(unittest.TestCase): + """Test the TreeBuilderRegistry class in general.""" + + def setUp(self): + self.registry = TreeBuilderRegistry() + + def builder_for_features(self, *feature_list): + cls = type('Builder_' + '_'.join(feature_list), + (object,), {'features' : feature_list}) + + self.registry.register(cls) + return cls + + def test_register_with_no_features(self): + builder = self.builder_for_features() + + # Since the builder advertises no features, you can't find it + # by looking up features. + self.assertEquals(self.registry.lookup('foo'), None) + + # But you can find it by doing a lookup with no features, if + # this happens to be the only registered builder. + self.assertEquals(self.registry.lookup(), builder) + + def test_register_with_features_makes_lookup_succeed(self): + builder = self.builder_for_features('foo', 'bar') + self.assertEquals(self.registry.lookup('foo'), builder) + self.assertEquals(self.registry.lookup('bar'), builder) + + def test_lookup_fails_when_no_builder_implements_feature(self): + builder = self.builder_for_features('foo', 'bar') + self.assertEquals(self.registry.lookup('baz'), None) + + def test_lookup_gets_most_recent_registration_when_no_feature_specified(self): + builder1 = self.builder_for_features('foo') + builder2 = self.builder_for_features('bar') + self.assertEquals(self.registry.lookup(), builder2) + + def test_lookup_fails_when_no_tree_builders_registered(self): + self.assertEquals(self.registry.lookup(), None) + + def test_lookup_gets_most_recent_builder_supporting_all_features(self): + has_one = self.builder_for_features('foo') + has_the_other = self.builder_for_features('bar') + has_both_early = self.builder_for_features('foo', 'bar', 'baz') + has_both_late = self.builder_for_features('foo', 'bar', 'quux') + lacks_one = self.builder_for_features('bar') + has_the_other = self.builder_for_features('foo') + + # There are two builders featuring 'foo' and 'bar', but + # the one that also features 'quux' was registered later. + self.assertEquals(self.registry.lookup('foo', 'bar'), + has_both_late) + + # There is only one builder featuring 'foo', 'bar', and 'baz'. + self.assertEquals(self.registry.lookup('foo', 'bar', 'baz'), + has_both_early) + + def test_lookup_fails_when_cannot_reconcile_requested_features(self): + builder1 = self.builder_for_features('foo', 'bar') + builder2 = self.builder_for_features('foo', 'baz') + self.assertEquals(self.registry.lookup('bar', 'baz'), None) diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py index 4d8dcc0..85cedbf 100644 --- a/bs4/tests/test_html5lib.py +++ b/bs4/tests/test_html5lib.py @@ -1,11 +1,19 @@ -from bs4.builder import HTML5TreeBuilder +try: + from bs4.builder import HTML5TreeBuilder + HTML5LIB_PRESENT = True +except ImportError, e: + HTML5LIB_PRESENT = False from bs4.element import Comment, SoupStrainer from test_lxml import ( TestLXMLBuilder, TestLXMLBuilderInvalidMarkup, TestLXMLBuilderEncodingConversion, ) +import unittest +@unittest.skipIf( + not HTML5LIB_PRESENT, + "html5lib seems not to be present, not testing its tree builder.") class TestHTML5Builder(TestLXMLBuilder): """See `BuilderSmokeTest`.""" @@ -73,7 +81,9 @@ class TestHTML5Builder(TestLXMLBuilder): # get a CData object. self.assertSoupEquals(markup, "<svg><!--[CDATA[foobar]]--></svg>") - +@unittest.skipIf( + not HTML5LIB_PRESENT, + "html5lib seems not to be present, not testing it on invalid markup.") class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): """See `BuilderInvalidMarkupSmokeTest`.""" @@ -210,6 +220,9 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): self.assertEquals(soup.p.string, u"\N{REPLACEMENT CHARACTER}") +@unittest.skipIf( + not HTML5LIB_PRESENT, + "html5lib seems not to be present, not testing encoding conversion.") class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion): @property def default_builder(self): diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 68677ca..865ac68 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -546,7 +546,7 @@ class TestTreeModification(SoupTest): self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>')) def test_new_tag_creation(self): - builder = builder_registry.lookup('html5lib')() + builder = builder_registry.lookup('html')() soup = self.soup("<body></body>", builder=builder) a = Tag(soup, builder, 'a') ol = Tag(soup, builder, 'ol') |