summaryrefslogtreecommitdiff
path: root/bs4
diff options
context:
space:
mode:
Diffstat (limited to 'bs4')
-rw-r--r--bs4/__init__.py6
-rw-r--r--bs4/builder/__init__.py19
-rw-r--r--bs4/tests/test_builder_registry.py14
-rw-r--r--bs4/tests/test_tree.py26
4 files changed, 47 insertions, 18 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index af4563f..ea6dd25 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -169,10 +169,10 @@ class BeautifulSoup(Tag):
except StopParsing:
pass
- # Clear out the markup and the builder so they can be CGed.
+ # Clear out the markup and remove the builder's circular
+ # reference to this object.
self.markup = None
self.builder.soup = None
- self.builder = None
def _feed(self):
# Convert the document to Unicode.
@@ -195,7 +195,7 @@ class BeautifulSoup(Tag):
def new_tag(self, name, **attrs):
"""Create a new tag associated with this soup."""
- return Tag(None, None, name, attrs)
+ return Tag(None, self.builder, name, attrs)
def new_string(self, s):
"""Create a new NavigableString associated with this soup."""
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index e6d4fa1..a17dce6 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -72,7 +72,6 @@ class TreeBuilderRegistry(object):
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()
-
class TreeBuilder(object):
"""Turn a document into a Beautiful Soup object tree."""
@@ -244,20 +243,20 @@ def register_treebuilders_from(module):
this_module.builder_registry.register(obj)
# Builders are registered in reverse order of priority, so that custom
-# builder registrations will take precedence. In general, we want
-# html5lib to take precedence over lxml, because it's more
-# reliable. And we only want to use HTMLParser as a last result.
+# builder registrations will take precedence. In general, we want lxml
+# to take precedence over html5lib, because it's faster. And we only
+# want to use HTMLParser as a last resort.
from .import _htmlparser
register_treebuilders_from(_htmlparser)
try:
- from . import _lxml
- register_treebuilders_from(_lxml)
-except ImportError:
- # They don't have lxml installed.
- pass
-try:
from . import _html5lib
register_treebuilders_from(_html5lib)
except ImportError:
# They don't have html5lib installed.
pass
+try:
+ from . import _lxml
+ register_treebuilders_from(_lxml)
+except ImportError:
+ # They don't have lxml installed.
+ pass
diff --git a/bs4/tests/test_builder_registry.py b/bs4/tests/test_builder_registry.py
index 4a60bc1..5f60462 100644
--- a/bs4/tests/test_builder_registry.py
+++ b/bs4/tests/test_builder_registry.py
@@ -17,6 +17,12 @@ try:
except ImportError:
HTML5LIB_PRESENT = False
+try:
+ from bs4.builder import LXMLTreeBuilder
+ LXML_PRESENT = True
+except ImportError:
+ LXML_PRESENT = False
+
class BuiltInRegistryTest(unittest.TestCase):
"""Test the built-in registry with the default builders registered."""
@@ -29,14 +35,14 @@ class BuiltInRegistryTest(unittest.TestCase):
self.assertEqual(registry.lookup('strict', 'html'),
HTMLParserTreeBuilder)
if HTML5LIB_PRESENT:
- self.assertEqual(registry.lookup('permissive', 'html'),
+ self.assertEqual(registry.lookup('html5lib', 'html'),
HTML5TreeBuilder)
def test_lookup_by_markup_type(self):
- if HTML5LIB_PRESENT:
- self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
- else:
+ if LXML_PRESENT:
self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
+ else:
+ self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
def test_named_library(self):
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 97dc5e6..692260c 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -15,7 +15,7 @@ import re
import warnings
from bs4 import BeautifulSoup
from bs4.builder import builder_registry
-from bs4.element import CData, SoupStrainer, Tag
+from bs4.element import CData, NavigableString, SoupStrainer, Tag
from bs4.testing import SoupTest
class TreeTest(SoupTest):
@@ -535,6 +535,30 @@ class TestTagCreation(SoupTest):
self.assertEqual(dict(bar="baz"), new_tag.attrs)
self.assertEqual(None, new_tag.parent)
+ def test_tag_inherits_self_closing_rules_from_builder(self):
+ xml_soup = BeautifulSoup("", "xml")
+ xml_br = xml_soup.new_tag("br")
+ xml_p = xml_soup.new_tag("p")
+
+ # Both the <br> and <p> tags are empty-element, just because
+ # they have no contents.
+ self.assertEqual(b"<br />", xml_br.encode())
+ self.assertEqual(b"<p />", xml_p.encode())
+
+ html_soup = BeautifulSoup("", "html")
+ html_br = html_soup.new_tag("br")
+ html_p = html_soup.new_tag("p")
+
+ # The HTML builder uses HTML's rules about which tags are
+ # empty-element tags, and the new tags reflect these rules.
+ self.assertEqual(b"<br />", html_br.encode())
+ self.assertEqual(b"<p></p>", html_p.encode())
+
+ def test_new_string_creates_navigablestring(self):
+ soup = self.soup("")
+ s = soup.new_string("foo")
+ self.assertEqual("foo", s)
+ self.assertTrue(isinstance(s, NavigableString))
class TestTreeModification(SoupTest):