Newly created tags use the same empty-element rules as the builder used to originally create the soup.

author: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-07 23:22:13 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-07 23:22:13 -0500
commit: 7cb84c432367c52702920d68ec6c9669e9b6c9db (patch)
tree: 917850eedaed45168d419cf95cfe83a140fd3a00
parent: 62d5de7f5ac4211b688665dd5912d4c4fd82e95c (diff)
7 files changed, 98 insertions, 46 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 926500f..9e5ad32 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,13 +1,30 @@
 = 4.0 beta 4 =
 
-Added BeautifulSoup.new_string() to go along with Beautifulsoup.new_tag()
-Pass in strict=False to html.parser on Python 3.
-Monkeypatch a serious bug in html.parser that made strict=False disastrous on Python 3.2.2.
+Added BeautifulSoup.new_string() to go along with BeautifulSoup.new_tag()
+
+BeautifulSoup.new_tag() will follow the rules of whatever tree-builder
+was used to create the original BeautifulSoup object. A new <p> tag
+will look like "<p />" if the soup object was created to parse XML,
+but it will look like "<p></p>" if the soup object was created to
+parse HTML.
+
+We pass in strict=False to html.parser on Python 3, greatly improving
+html.parser's ability to handle bad HTML.
+
+Monkeypatch a serious bug in html.parser that made strict=False
+disastrous on Python 3.2.2.
+
 Replaced the "substitute_html_entities" argument with the "formatter" argument.
-Bare ampersands and angle brackets are always converted to XML entities unless the user prevents it.
+
+Bare ampersands and angle brackets are always converted to XML
+entities unless the user prevents it.
+
 Added PageElement.insert_before().
+
 Added PageElement.insert_after().
-Raise an exception when the user tries to do something stupid like insert a tag into itself.
+
+Raise an exception when the user tries to do something nonsensical
+like insert a tag into itself.
 
 = 4.0 =
 
diff --git a/TODO b/TODO
index 1920d01..2f03dd2 100644
--- a/TODO
+++ b/TODO
@@ -11,10 +11,6 @@ Big features
 
 * Add namespace support.
 
-* soup.new_tag("<br>") should create an empty-element tag if the soup
-was created with an HTML-aware builder, but not otherwise. This
-requires keeping around information about the builder.
-
 Optimizations
 -------------
 
diff --git a/bs4/__init__.py b/bs4/__init__.py
index af4563f..ea6dd25 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -169,10 +169,10 @@ class BeautifulSoup(Tag):
         except StopParsing:
             pass
 
-        # Clear out the markup and the builder so they can be CGed.
+        # Clear out the markup and remove the builder's circular
+        # reference to this object.
         self.markup = None
         self.builder.soup = None
-        self.builder = None
 
     def _feed(self):
         # Convert the document to Unicode.
@@ -195,7 +195,7 @@ class BeautifulSoup(Tag):
 
     def new_tag(self, name, **attrs):
         """Create a new tag associated with this soup."""
-        return Tag(None, None, name, attrs)
+        return Tag(None, self.builder, name, attrs)
 
     def new_string(self, s):
         """Create a new NavigableString associated with this soup."""
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index e6d4fa1..a17dce6 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -72,7 +72,6 @@ class TreeBuilderRegistry(object):
 # to look up builders in this registry.
 builder_registry = TreeBuilderRegistry()
 
-
 class TreeBuilder(object):
     """Turn a document into a Beautiful Soup object tree."""
 
@@ -244,20 +243,20 @@ def register_treebuilders_from(module):
             this_module.builder_registry.register(obj)
 
 # Builders are registered in reverse order of priority, so that custom
-# builder registrations will take precedence. In general, we want
-# html5lib to take precedence over lxml, because it's more
-# reliable. And we only want to use HTMLParser as a last result.
+# builder registrations will take precedence. In general, we want lxml
+# to take precedence over html5lib, because it's faster. And we only
+# want to use HTMLParser as a last resort.
 from .import _htmlparser
 register_treebuilders_from(_htmlparser)
 try:
-    from . import _lxml
-    register_treebuilders_from(_lxml)
-except ImportError:
-    # They don't have lxml installed.
-    pass
-try:
     from . import _html5lib
     register_treebuilders_from(_html5lib)
 except ImportError:
     # They don't have html5lib installed.
     pass
+try:
+    from . import _lxml
+    register_treebuilders_from(_lxml)
+except ImportError:
+    # They don't have lxml installed.
+    pass
diff --git a/bs4/tests/test_builder_registry.py b/bs4/tests/test_builder_registry.py
index 4a60bc1..5f60462 100644
--- a/bs4/tests/test_builder_registry.py
+++ b/bs4/tests/test_builder_registry.py
@@ -17,6 +17,12 @@ try:
 except ImportError:
     HTML5LIB_PRESENT = False
 
+try:
+    from bs4.builder import LXMLTreeBuilder
+    LXML_PRESENT = True
+except ImportError:
+    LXML_PRESENT = False
+
 
 class BuiltInRegistryTest(unittest.TestCase):
     """Test the built-in registry with the default builders registered."""
@@ -29,14 +35,14 @@ class BuiltInRegistryTest(unittest.TestCase):
         self.assertEqual(registry.lookup('strict', 'html'),
                           HTMLParserTreeBuilder)
         if HTML5LIB_PRESENT:
-            self.assertEqual(registry.lookup('permissive', 'html'),
+            self.assertEqual(registry.lookup('html5lib', 'html'),
                               HTML5TreeBuilder)
 
     def test_lookup_by_markup_type(self):
-        if HTML5LIB_PRESENT:
-            self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
-        else:
+        if LXML_PRESENT:
             self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
+        else:
+            self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
         self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
 
     def test_named_library(self):
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 97dc5e6..692260c 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -15,7 +15,7 @@ import re
 import warnings
 from bs4 import BeautifulSoup
 from bs4.builder import builder_registry
-from bs4.element import CData, SoupStrainer, Tag
+from bs4.element import CData, NavigableString, SoupStrainer, Tag
 from bs4.testing import SoupTest
 
 class TreeTest(SoupTest):
@@ -535,6 +535,30 @@ class TestTagCreation(SoupTest):
         self.assertEqual(dict(bar="baz"), new_tag.attrs)
         self.assertEqual(None, new_tag.parent)
 
+    def test_tag_inherits_self_closing_rules_from_builder(self):
+        xml_soup = BeautifulSoup("", "xml")
+        xml_br = xml_soup.new_tag("br")
+        xml_p = xml_soup.new_tag("p")
+
+        # Both the <br> and <p> tags are empty-element, just because
+        # they have no contents.
+        self.assertEqual(b"<br />", xml_br.encode())
+        self.assertEqual(b"<p />", xml_p.encode())
+
+        html_soup = BeautifulSoup("", "html")
+        html_br = html_soup.new_tag("br")
+        html_p = html_soup.new_tag("p")
+
+        # The HTML builder uses HTML's rules about which tags are
+        # empty-element tags, and the new tags reflect these rules.
+        self.assertEqual(b"<br />", html_br.encode())
+        self.assertEqual(b"<p></p>", html_p.encode())
+
+    def test_new_string_creates_navigablestring(self):
+        soup = self.soup("")
+        s = soup.new_string("foo")
+        self.assertEqual("foo", s)
+        self.assertTrue(isinstance(s, NavigableString))
 
 class TestTreeModification(SoupTest):
 
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 75be6da..fa0648d 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -152,10 +152,11 @@ Installing Beautiful Soup
 =========================
 
 Beautiful Soup 4 is published through PyPi, so you can install it with
-``easy_install``. The package name is ``beautifulsoup4``, and the same
-package works on Python 2 and Python 3.
+``easy_install`` or ``pip``. The package name is ``beautifulsoup4``,
+and the same package works on Python 2 and Python 3.
 
 :kbd:`$ easy_install beautifulsoup4`
+:kbd:`$ pip install beautifulsoup4`
 
 (The ``BeautifulSoup`` package is probably `not` what you want. That's
 the previous major release, `Beautiful Soup 3`_. Lots of software uses
@@ -163,11 +164,10 @@ BS3, so it's still available, but if you're writing new code you
 should install ``beautifulsoup4``.)
 
 You can also `download the Beautiful Soup 4 source tarball
-<http://www.crummy.com/software/BeautifulSoup/download/4.x/beautifulsoup4-4.0.0b3.tar.gz>`_
-and install it with ``setup.py``. The license for Beautiful Soup
-allows you to package the entire library with your application, so you
-can also download the tarball and insert the ``bs4`` directory into
-your application's codebase.
+<http://www.crummy.com/software/BeautifulSoup/download/4.x/>`_ and
+install it with ``setup.py``. The license for Beautiful Soup allows
+you to package the entire library with your application, so you can
+also copy the ``bs4`` directory into your application's codebase.
 
 I use Python 2.7 and Python 3.2 to develop Beautiful Soup, but it
 should work with other recent versions.
@@ -177,10 +177,15 @@ should work with other recent versions.
 Be sure to install a good parser!
 ---------------------------------
 
-By default, Beautiful Soup uses the HTML parser that comes with
-Python. Unfortunately, that parser is not very good at handling bad
-HTML. I recommend you install the `lxml parser
-<http://lxml.de/>`_. It's very fast, it works with both Python 2 and
+Beautiful Soup uses a plugin system that supports a number of popular
+Python parsers. If no third-party parsers are installed, Beautiful
+Soup uses the HTML parser that comes with Python. In recent releases
+of Python (2.7.2 and 3.2.2), this parser works pretty well at handling
+bad HTML. In older releases, it's not so good.
+
+Even if you're using a recent release of Python, I recommend you
+install the `lxml parser <http://lxml.de/>`_ if possible. It's much
+faster than Python's built-in parser. It works with both Python 2 and
 Python 3, and it parses HTML and XML very well. Beautiful Soup will
 detect that you have lxml installed, and use it instead of Python's
 built-in parser.
@@ -191,6 +196,8 @@ Depending on your setup, you might install lxml with one of these commands:
 
 :kbd:`$ easy_install lxml`
 
+:kbd:`$ pip install lxml`
+
 If you're using Python 2, another alternative is the pure-Python
 `html5lib parser <http://code.google.com/p/html5lib/>`_, which parses
 HTML the way a web browser does. Depending on your setup, you might
@@ -200,6 +207,8 @@ install html5lib with one of these commands:
 
 :kbd:`$ easy_install html5lib`
 
+:kbd:`$ pip install html5lib`
+
 Making the soup
 ===============
 
@@ -1464,7 +1473,7 @@ like calling ``.append()`` on a Python list::
    soup.a.contents
    # [u'Foo', u'Bar']
 
-``BeautifulSoup.new_tag()`` and ``new_string()``
+``BeautifulSoup.new_string()`` and ``.new_tag()``
 ------------------------------------------------
 
 If you need to add a string to a document, no problem--you can pass a
@@ -1487,7 +1496,7 @@ call the factory method ``BeautifulSoup.new_tag()``::
    soup = BeautifulSoup("<b></b>")
    original_tag = soup.b
 
-   new_tag = soup.new_tag("a", dict(href="http://www.example.com"))
+   new_tag = soup.new_tag("a", href="http://www.example.com")
    original_tag.append(new_tag)
    original_tag
    # <b><a href="http://www.example.com"></a></b>
@@ -1519,8 +1528,8 @@ say. It works just like ``.insert()`` on a Python list::
 ``move_before()`` and ``move_after()``
 ------------------------------------------
 
-The ``move_before()`` method adds a tag or string to the parse tree
-immediately before something else::
+The ``move_before()`` method moves a tag or string so that it
+immediately precedes something else in the parse tree::
 
    soup = BeautifulSoup("<b>stop</b>")
    tag = soup.new_tag("i")
@@ -1529,8 +1538,8 @@ immediately before something else::
    soup.b
    # <b><i>Don't</i>stop</b>
 
-The ``move_after()`` method adds a tag or string to the parse tree
-immediately `after` something else::
+The ``move_after()`` method moves a tag or string so that it
+immediately follows something else in the parse tree::
 
    soup.new_string(" ever ").move_after(soup.b.i)
    soup.b
@@ -2232,11 +2241,12 @@ Beautiful Soup 3.2.0 is the old version, the last release of the
 Beautiful Soup 3 series. It's currently the version packaged with all
 major Linux distributions::
 
- $ apt-get install python-beautifulsoup
+:kbd:`$ apt-get install python-beautifulsoup`
 
 It's also published through PyPi as `BeautifulSoup`.::
 
- $ easy_install BeautifulSoup
+:kbd:`$ easy_install BeautifulSoup`
+:kbd:`$ pip install BeautifulSoup`
 
 You can also `download a tarball of Beautiful Soup 3.2.0
 <http://www.crummy.com/software/BeautifulSoup/bs3/download/3.x/BeautifulSoup-3.2.0.tar.gz>`_.
author	Leonard Richardson <leonard.richardson@canonical.com>	2012-02-07 23:22:13 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2012-02-07 23:22:13 -0500
commit	7cb84c432367c52702920d68ec6c9669e9b6c9db (patch)
tree	917850eedaed45168d419cf95cfe83a140fd3a00
parent	62d5de7f5ac4211b688665dd5912d4c4fd82e95c (diff)