diff options
-rw-r--r-- | NEWS.txt | 11 | ||||
-rw-r--r-- | bs4/__init__.py | 4 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 3 | ||||
-rw-r--r-- | bs4/element.py | 2 | ||||
-rw-r--r-- | bs4/testing.py | 5 | ||||
-rw-r--r-- | bs4/tests/test_lxml.py | 13 |
6 files changed, 37 insertions, 1 deletions
@@ -8,6 +8,13 @@ you strings that are visible in the document--no comments or processing commands. [bug=1050164] +* The BeautifulSoup class is now aliased to "_s" and "_soup", making + it quicker to type an import statement in an interactive session: + + from bs4 import _s + or + from bs4 import _soup + * Fix a bug in the html5lib treebuilder which sometimes created disconnected trees. [bug=1039527] @@ -20,6 +27,10 @@ * Stop a crash when unwisely messing with a tag that's been decomposed. [bug=1097699] +* Now that lxml's segfault on invalid doctype has been fixed, fix a + corresponding problem on the Beautiful Soup end that was previously + invisible. [bug=984936] + = 4.1.3 (20120820) = * Skipped a test under Python 2.6 and Python 3.1 to avoid a spurious diff --git a/bs4/__init__.py b/bs4/__init__.py index fe2656b..88177d6 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -335,6 +335,10 @@ class BeautifulSoup(Tag): return prefix + super(BeautifulSoup, self).decode( indent_level, eventual_encoding, formatter) +# Alias to make it easier to type import: 'from bs4 import _soup' +_s = BeautifulSoup +_soup = BeautifulSoup + class BeautifulStoneSoup(BeautifulSoup): """Deprecated interface to an XML parser.""" diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index ede5cec..e34c9fa 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -85,6 +85,9 @@ class BeautifulSoupHTMLParser(HTMLParser): self.soup.endData() if data.startswith("DOCTYPE "): data = data[len("DOCTYPE "):] + elif data == 'DOCTYPE': + # i.e. 
"<!DOCTYPE>" + data = '' self.soup.handle_data(data) self.soup.endData(Doctype) diff --git a/bs4/element.py b/bs4/element.py index c081eba..5ccb019 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -779,7 +779,7 @@ class Doctype(PreformattedString): @classmethod def for_name_and_ids(cls, name, pub_id, system_id): - value = name + value = name or '' if pub_id is not None: value += ' PUBLIC "%s"' % pub_id if system_id is not None: diff --git a/bs4/testing.py b/bs4/testing.py index c9307d3..ed71d3b 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -81,6 +81,11 @@ class HTMLTreeBuilderSmokeTest(object): self.assertDoctypeHandled( 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') + def test_empty_doctype(self): + soup = self.soup("<!DOCTYPE>") + doctype = soup.contents[0] + self.assertEqual("", doctype.strip()) + def test_public_doctype_with_url(self): doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' self.assertDoctypeHandled(doctype) diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index 39e26bf..f32fc2b 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -6,6 +6,8 @@ import warnings try: from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML LXML_PRESENT = True + import lxml.etree + LXML_VERSION = lxml.etree.LXML_VERSION except ImportError, e: LXML_PRESENT = False @@ -41,6 +43,17 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): self.assertSoupEquals( "<p>foo&#1000000000;bar</p>", "<p>foobar</p>") + # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this + # test if an old version of lxml is installed. 
+ + @skipIf( + LXML_VERSION < (2,3,5,0), + "Skipping doctype test for old version of lxml to avoid segfault.") + def test_empty_doctype(self): + soup = self.soup("<!DOCTYPE>") + doctype = soup.contents[0] + self.assertEqual("", doctype.strip()) + def test_beautifulstonesoup_is_xml_parser(self): # Make sure that the deprecated BSS class uses an xml builder # if one is installed. |