diff options
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/__init__.py | 6 | ||||
-rw-r--r-- | bs4/builder/__init__.py | 19 | ||||
-rw-r--r-- | bs4/tests/test_builder_registry.py | 14 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 26 |
4 files changed, 47 insertions, 18 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py index af4563f..ea6dd25 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -169,10 +169,10 @@ class BeautifulSoup(Tag): except StopParsing: pass - # Clear out the markup and the builder so they can be CGed. + # Clear out the markup and remove the builder's circular + # reference to this object. self.markup = None self.builder.soup = None - self.builder = None def _feed(self): # Convert the document to Unicode. @@ -195,7 +195,7 @@ class BeautifulSoup(Tag): def new_tag(self, name, **attrs): """Create a new tag associated with this soup.""" - return Tag(None, None, name, attrs) + return Tag(None, self.builder, name, attrs) def new_string(self, s): """Create a new NavigableString associated with this soup.""" diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index e6d4fa1..a17dce6 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -72,7 +72,6 @@ class TreeBuilderRegistry(object): # to look up builders in this registry. builder_registry = TreeBuilderRegistry() - class TreeBuilder(object): """Turn a document into a Beautiful Soup object tree.""" @@ -244,20 +243,20 @@ def register_treebuilders_from(module): this_module.builder_registry.register(obj) # Builders are registered in reverse order of priority, so that custom -# builder registrations will take precedence. In general, we want -# html5lib to take precedence over lxml, because it's more -# reliable. And we only want to use HTMLParser as a last result. +# builder registrations will take precedence. In general, we want lxml +# to take precedence over html5lib, because it's faster. And we only +# want to use HTMLParser as a last resort. from .import _htmlparser register_treebuilders_from(_htmlparser) try: - from . import _lxml - register_treebuilders_from(_lxml) -except ImportError: - # They don't have lxml installed. - pass -try: from . 
import _html5lib register_treebuilders_from(_html5lib) except ImportError: # They don't have html5lib installed. pass +try: + from . import _lxml + register_treebuilders_from(_lxml) +except ImportError: + # They don't have lxml installed. + pass diff --git a/bs4/tests/test_builder_registry.py b/bs4/tests/test_builder_registry.py index 4a60bc1..5f60462 100644 --- a/bs4/tests/test_builder_registry.py +++ b/bs4/tests/test_builder_registry.py @@ -17,6 +17,12 @@ try: except ImportError: HTML5LIB_PRESENT = False +try: + from bs4.builder import LXMLTreeBuilder + LXML_PRESENT = True +except ImportError: + LXML_PRESENT = False + class BuiltInRegistryTest(unittest.TestCase): """Test the built-in registry with the default builders registered.""" @@ -29,14 +35,14 @@ class BuiltInRegistryTest(unittest.TestCase): self.assertEqual(registry.lookup('strict', 'html'), HTMLParserTreeBuilder) if HTML5LIB_PRESENT: - self.assertEqual(registry.lookup('permissive', 'html'), + self.assertEqual(registry.lookup('html5lib', 'html'), HTML5TreeBuilder) def test_lookup_by_markup_type(self): - if HTML5LIB_PRESENT: - self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) - else: + if LXML_PRESENT: self.assertEqual(registry.lookup('html'), LXMLTreeBuilder) + else: + self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML) def test_named_library(self): diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 97dc5e6..692260c 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -15,7 +15,7 @@ import re import warnings from bs4 import BeautifulSoup from bs4.builder import builder_registry -from bs4.element import CData, SoupStrainer, Tag +from bs4.element import CData, NavigableString, SoupStrainer, Tag from bs4.testing import SoupTest class TreeTest(SoupTest): @@ -535,6 +535,30 @@ class TestTagCreation(SoupTest): self.assertEqual(dict(bar="baz"), new_tag.attrs) self.assertEqual(None, new_tag.parent) + def 
test_tag_inherits_self_closing_rules_from_builder(self): + xml_soup = BeautifulSoup("", "xml") + xml_br = xml_soup.new_tag("br") + xml_p = xml_soup.new_tag("p") + + # Both the <br> and <p> tags are empty-element, just because + # they have no contents. + self.assertEqual(b"<br />", xml_br.encode()) + self.assertEqual(b"<p />", xml_p.encode()) + + html_soup = BeautifulSoup("", "html") + html_br = html_soup.new_tag("br") + html_p = html_soup.new_tag("p") + + # The HTML builder uses HTML's rules about which tags are + # empty-element tags, and the new tags reflect these rules. + self.assertEqual(b"<br />", html_br.encode()) + self.assertEqual(b"<p></p>", html_p.encode()) + + def test_new_string_creates_navigablestring(self): + soup = self.soup("") + s = soup.new_string("foo") + self.assertEqual("foo", s) + self.assertTrue(isinstance(s, NavigableString)) class TestTreeModification(SoupTest): |