diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-07 23:22:13 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-07 23:22:13 -0500 |
commit | 7cb84c432367c52702920d68ec6c9669e9b6c9db (patch) | |
tree | 917850eedaed45168d419cf95cfe83a140fd3a00 | |
parent | 62d5de7f5ac4211b688665dd5912d4c4fd82e95c (diff) |
Newly created tags use the same empty-element rules as the builder used to originally create the soup.
-rw-r--r-- | CHANGELOG | 27 | ||||
-rw-r--r-- | TODO | 4 | ||||
-rw-r--r-- | bs4/__init__.py | 6 | ||||
-rw-r--r-- | bs4/builder/__init__.py | 19 | ||||
-rw-r--r-- | bs4/tests/test_builder_registry.py | 14 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 26 | ||||
-rw-r--r-- | doc/source/index.rst | 48 |
7 files changed, 98 insertions, 46 deletions
@@ -1,13 +1,30 @@ = 4.0 beta 4 = -Added BeautifulSoup.new_string() to go along with Beautifulsoup.new_tag() -Pass in strict=False to html.parser on Python 3. -Monkeypatch a serious bug in html.parser that made strict=False disastrous on Python 3.2.2. +Added BeautifulSoup.new_string() to go along with BeautifulSoup.new_tag() + +BeautifulSoup.new_tag() will follow the rules of whatever tree-builder +was used to create the original BeautifulSoup object. A new <p> tag +will look like "<p />" if the soup object was created to parse XML, +but it will look like "<p></p>" if the soup object was created to +parse HTML. + +We pass in strict=False to html.parser on Python 3, greatly improving +html.parser's ability to handle bad HTML. + +Monkeypatch a serious bug in html.parser that made strict=False +disastrous on Python 3.2.2. + Replaced the "substitute_html_entities" argument with the "formatter" argument. -Bare ampersands and angle brackets are always converted to XML entities unless the user prevents it. + +Bare ampersands and angle brackets are always converted to XML +entities unless the user prevents it. + Added PageElement.insert_before(). + Added PageElement.insert_after(). -Raise an exception when the user tries to do something stupid like insert a tag into itself. + +Raise an exception when the user tries to do something nonsensical +like insert a tag into itself. = 4.0 = @@ -11,10 +11,6 @@ Big features * Add namespace support. -* soup.new_tag("<br>") should create an empty-element tag if the soup -was created with an HTML-aware builder, but not otherwise. This -requires keeping around information about the builder. - Optimizations ------------- diff --git a/bs4/__init__.py b/bs4/__init__.py index af4563f..ea6dd25 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -169,10 +169,10 @@ class BeautifulSoup(Tag): except StopParsing: pass - # Clear out the markup and the builder so they can be CGed. 
+ # Clear out the markup and remove the builder's circular + # reference to this object. self.markup = None self.builder.soup = None - self.builder = None def _feed(self): # Convert the document to Unicode. @@ -195,7 +195,7 @@ class BeautifulSoup(Tag): def new_tag(self, name, **attrs): """Create a new tag associated with this soup.""" - return Tag(None, None, name, attrs) + return Tag(None, self.builder, name, attrs) def new_string(self, s): """Create a new NavigableString associated with this soup.""" diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index e6d4fa1..a17dce6 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -72,7 +72,6 @@ class TreeBuilderRegistry(object): # to look up builders in this registry. builder_registry = TreeBuilderRegistry() - class TreeBuilder(object): """Turn a document into a Beautiful Soup object tree.""" @@ -244,20 +243,20 @@ def register_treebuilders_from(module): this_module.builder_registry.register(obj) # Builders are registered in reverse order of priority, so that custom -# builder registrations will take precedence. In general, we want -# html5lib to take precedence over lxml, because it's more -# reliable. And we only want to use HTMLParser as a last resort. +# builder registrations will take precedence. In general, we want lxml +# to take precedence over html5lib, because it's faster. And we only +# want to use HTMLParser as a last resort. from .import _htmlparser register_treebuilders_from(_htmlparser) try: - from . import _lxml - register_treebuilders_from(_lxml) -except ImportError: - # They don't have lxml installed. - pass -try: from . import _html5lib register_treebuilders_from(_html5lib) except ImportError: # They don't have html5lib installed. pass +try: + from . import _lxml + register_treebuilders_from(_lxml) +except ImportError: + # They don't have lxml installed. 
+ pass diff --git a/bs4/tests/test_builder_registry.py b/bs4/tests/test_builder_registry.py index 4a60bc1..5f60462 100644 --- a/bs4/tests/test_builder_registry.py +++ b/bs4/tests/test_builder_registry.py @@ -17,6 +17,12 @@ try: except ImportError: HTML5LIB_PRESENT = False +try: + from bs4.builder import LXMLTreeBuilder + LXML_PRESENT = True +except ImportError: + LXML_PRESENT = False + class BuiltInRegistryTest(unittest.TestCase): """Test the built-in registry with the default builders registered.""" @@ -29,14 +35,14 @@ class BuiltInRegistryTest(unittest.TestCase): self.assertEqual(registry.lookup('strict', 'html'), HTMLParserTreeBuilder) if HTML5LIB_PRESENT: - self.assertEqual(registry.lookup('permissive', 'html'), + self.assertEqual(registry.lookup('html5lib', 'html'), HTML5TreeBuilder) def test_lookup_by_markup_type(self): - if HTML5LIB_PRESENT: - self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) - else: + if LXML_PRESENT: self.assertEqual(registry.lookup('html'), LXMLTreeBuilder) + else: + self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML) def test_named_library(self): diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 97dc5e6..692260c 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -15,7 +15,7 @@ import re import warnings from bs4 import BeautifulSoup from bs4.builder import builder_registry -from bs4.element import CData, SoupStrainer, Tag +from bs4.element import CData, NavigableString, SoupStrainer, Tag from bs4.testing import SoupTest class TreeTest(SoupTest): @@ -535,6 +535,30 @@ class TestTagCreation(SoupTest): self.assertEqual(dict(bar="baz"), new_tag.attrs) self.assertEqual(None, new_tag.parent) + def test_tag_inherits_self_closing_rules_from_builder(self): + xml_soup = BeautifulSoup("", "xml") + xml_br = xml_soup.new_tag("br") + xml_p = xml_soup.new_tag("p") + + # Both the <br> and <p> tag are empty-element, just because + # they have 
no contents. + self.assertEqual(b"<br />", xml_br.encode()) + self.assertEqual(b"<p />", xml_p.encode()) + + html_soup = BeautifulSoup("", "html") + html_br = html_soup.new_tag("br") + html_p = html_soup.new_tag("p") + + # The HTML builder uses HTML's rules about which tags are + # empty-element tags, and the new tags reflect these rules. + self.assertEqual(b"<br />", html_br.encode()) + self.assertEqual(b"<p></p>", html_p.encode()) + + def test_new_string_creates_navigablestring(self): + soup = self.soup("") + s = soup.new_string("foo") + self.assertEqual("foo", s) + self.assertTrue(isinstance(s, NavigableString)) class TestTreeModification(SoupTest): diff --git a/doc/source/index.rst b/doc/source/index.rst index 75be6da..fa0648d 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -152,10 +152,11 @@ Installing Beautiful Soup ========================= Beautiful Soup 4 is published through PyPI, so you can install it with -``easy_install``. The package name is ``beautifulsoup4``, and the same -package works on Python 2 and Python 3. +``easy_install`` or ``pip``. The package name is ``beautifulsoup4``, +and the same package works on Python 2 and Python 3. :kbd:`$ easy_install beautifulsoup4` +:kbd:`$ pip install beautifulsoup4` (The ``BeautifulSoup`` package is probably `not` what you want. That's the previous major release, `Beautiful Soup 3`_. Lots of software uses @@ -163,11 +164,10 @@ BS3, so it's still available, but if you're writing new code you should install ``beautifulsoup4``.) You can also `download the Beautiful Soup 4 source tarball -<http://www.crummy.com/software/BeautifulSoup/download/4.x/beautifulsoup4-4.0.0b3.tar.gz>`_ -and install it with ``setup.py``. The license for Beautiful Soup -allows you to package the entire library with your application, so you -can also download the tarball and insert the ``bs4`` directory into -your application's codebase. 
+<http://www.crummy.com/software/BeautifulSoup/download/4.x/>`_ and +install it with ``setup.py``. The license for Beautiful Soup allows +you to package the entire library with your application, allowing you +to copy the ``bs4`` directory into your application's codebase. I use Python 2.7 and Python 3.2 to develop Beautiful Soup, but it should work with other recent versions. @@ -177,10 +177,15 @@ should work with other recent versions. Be sure to install a good parser! --------------------------------- -By default, Beautiful Soup uses the HTML parser that comes with -Python. Unfortunately, that parser is not very good at handling bad -HTML. I recommend you install the `lxml parser -<http://lxml.de/>`_. It's very fast, it works with both Python 2 and +Beautiful Soup uses a plugin system that supports a number of popular +Python parsers. If no third-party parsers are installed, Beautiful +Soup uses the HTML parser that comes with Python. In recent releases +of Python (2.7.2 and 3.2.2), this parser works pretty well at handling +bad HTML. In older releases, it's not so good. + +Even if you're using a recent release of Python, I recommend you +install the `lxml parser <http://lxml.de/>`_ if possible. It's much +faster than Python's built-in parser. It works with both Python 2 and Python 3, and it parses HTML and XML very well. Beautiful Soup will detect that you have lxml installed, and use it instead of Python's built-in parser. @@ -191,6 +196,8 @@ Depending on your setup, you might install lxml with one of these commands: :kbd:`$ easy_install lxml` +:kbd:`$ pip install lxml` + If you're using Python 2, another alternative is the pure-Python `html5lib parser <http://code.google.com/p/html5lib/>`_, which parses HTML the way a web browser does. 
Depending on your setup, you might @@ -200,6 +207,8 @@ install html5lib with one of these commands: :kbd:`$ easy_install html5lib` +:kbd:`$ pip install html5lib` + Making the soup =============== @@ -1464,7 +1473,7 @@ like calling ``.append()`` on a Python list:: soup.a.contents # [u'Foo', u'Bar'] -``BeautifulSoup.new_tag()`` and ``new_string()`` +``BeautifulSoup.new_string()`` and ``.new_tag()`` ------------------------------------------------ If you need to add a string to a document, no problem--you can pass a @@ -1487,7 +1496,7 @@ call the factory method ``BeautifulSoup.new_tag()``:: soup = BeautifulSoup("<b></b>") original_tag = soup.b - new_tag = soup.new_tag("a", dict(href="http://www.example.com")) + new_tag = soup.new_tag("a", href="http://www.example.com") original_tag.append(new_tag) original_tag # <b><a href="http://www.example.com"></a></b> @@ -1519,8 +1528,8 @@ say. It works just like ``.insert()`` on a Python list:: ``move_before()`` and ``move_after()`` ------------------------------------------ -The ``move_before()`` method adds a tag or string to the parse tree -immediately before something else:: +The ``move_before()`` method moves a tag or string so that it +immediately precedes something else in the parse tree:: soup = BeautifulSoup("<b>stop</b>") tag = soup.new_tag("i") @@ -1529,8 +1538,8 @@ immediately before something else:: soup.b # <b><i>Don't</i>stop</b> -The ``move_after()`` method adds a tag or string to the parse tree -immediately `after` something else:: +The ``move_after()`` method moves a tag or string so that it +immediately follows something else in the parse tree:: soup.new_string(" ever ").move_after(soup.b.i) soup.b @@ -2232,11 +2241,12 @@ Beautiful Soup 3.2.0 is the old version, the last release of the Beautiful Soup 3 series. 
It's currently the version packaged with all major Linux distributions:: - $ apt-get install python-beautifulsoup +:kbd:`$ apt-get install python-beautifulsoup` It's also published through PyPI as `BeautifulSoup`.:: - $ easy_install BeautifulSoup +:kbd:`$ easy_install BeautifulSoup` +:kbd:`$ pip install BeautifulSoup` You can also `download a tarball of Beautiful Soup 3.2.0 <http://www.crummy.com/software/BeautifulSoup/bs3/download/3.x/BeautifulSoup-3.2.0.tar.gz>`_. |