From 269157a8f40dfdac082f39befd69f170263d2ce1 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Tue, 7 May 2013 08:19:02 -0400 Subject: Now that lxml's segfault on invalid doctype has been fixed, fix a corresponding problem on the Beautiful Soup end that was previously invisible. [bug=984936] --- NEWS.txt | 4 ++++ bs4/builder/_htmlparser.py | 3 +++ bs4/element.py | 2 +- bs4/testing.py | 5 +++++ bs4/tests/test_lxml.py | 19 +++++++++++++++++++ 5 files changed, 32 insertions(+), 1 deletion(-) diff --git a/NEWS.txt b/NEWS.txt index edbba28..c2739ca 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -17,6 +17,10 @@ * Fix a bug by which keyword arguments to find_parent() were not being passed on. [bug=1126734] +* Now that lxml's segfault on invalid doctype has been fixed, fix a + corresponding problem on the Beautiful Soup end that was previously + invisible. [bug=984936] + = 4.1.3 (20120820) = * Skipped a test under Python 2.6 and Python 3.1 to avoid a spurious diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index ede5cec..e34c9fa 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -85,6 +85,9 @@ class BeautifulSoupHTMLParser(HTMLParser): self.soup.endData() if data.startswith("DOCTYPE "): data = data[len("DOCTYPE "):] + elif data == 'DOCTYPE': + # i.e. 
"" + data = '' self.soup.handle_data(data) self.soup.endData(Doctype) diff --git a/bs4/element.py b/bs4/element.py index d58da92..f38d9b4 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -779,7 +779,7 @@ class Doctype(PreformattedString): @classmethod def for_name_and_ids(cls, name, pub_id, system_id): - value = name + value = name or '' if pub_id is not None: value += ' PUBLIC "%s"' % pub_id if system_id is not None: diff --git a/bs4/testing.py b/bs4/testing.py index c9307d3..ed71d3b 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -81,6 +81,11 @@ class HTMLTreeBuilderSmokeTest(object): self.assertDoctypeHandled( 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') + def test_empty_doctype(self): + soup = self.soup("") + doctype = soup.contents[0] + self.assertEqual("", doctype.strip()) + def test_public_doctype_with_url(self): doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' self.assertDoctypeHandled(doctype) diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index 39e26bf..693ec25 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -6,6 +6,14 @@ import warnings try: from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML LXML_PRESENT = True + import lxml.etree + LXML_VERSION = [] + for i in lxml.etree.__version__.split('.'): + try: + part = int(i) + except TypeError: + part = 0 + LXML_VERSION.append(part) except ImportError, e: LXML_PRESENT = False @@ -41,6 +49,17 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): self.assertSoupEquals( "

<p>foo&#1000000000;bar</p>

", "

<p>foobar</p>

") + # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this + # test if an old version of lxml is installed. + + @skipIf( + LXML_VERSION < [2,3,5], + "Skipping doctype test for old version of lxml to avoid segfault.") + def test_empty_doctype(self): + soup = self.soup("") + doctype = soup.contents[0] + self.assertEqual("", doctype.strip()) + def test_beautifulstonesoup_is_xml_parser(self): # Make sure that the deprecated BSS class uses an xml builder # if one is installed. -- cgit v1.2.3 From e31151091c3dd44d0f39ba234df261f362199ae5 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Tue, 7 May 2013 08:36:07 -0400 Subject: Improved detection of lxml version number. --- bs4/tests/test_lxml.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index 693ec25..f32fc2b 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -7,13 +7,7 @@ try: from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML LXML_PRESENT = True import lxml.etree - LXML_VERSION = [] - for i in lxml.etree.__version__.split('.'): - try: - part = int(i) - except TypeError: - part = 0 - LXML_VERSION.append(part) + LXML_VERSION = lxml.etree.LXML_VERSION except ImportError, e: LXML_PRESENT = False @@ -53,7 +47,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): # test if an old version of lxml is installed. @skipIf( - LXML_VERSION < [2,3,5], + LXML_VERSION < (2,3,5,0), "Skipping doctype test for old version of lxml to avoid segfault.") def test_empty_doctype(self): soup = self.soup("") -- cgit v1.2.3 From 5b3860ec348b66976de64b5be407704041102869 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Tue, 7 May 2013 08:40:35 -0400 Subject: Aliased the BeautifulSoup class to the easier-to-type "_s" and "_soup". 
--- NEWS.txt | 7 +++++++ bs4/__init__.py | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/NEWS.txt b/NEWS.txt index c2739ca..70a1dc7 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -8,6 +8,13 @@ you strings that are visible in the document--no comments or processing commands. [bug=1050164] +* The BeautifulSoup class is now aliased to "_s" and "_soup", making + it quicker to type an import statement in an interactive session: + + from bs4 import _s + or + from bs4 import _soup + * Fix a bug in the html5lib treebuilder which sometimes created disconnected trees. [bug=1039527] diff --git a/bs4/__init__.py b/bs4/__init__.py index fe2656b..88177d6 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -335,6 +335,10 @@ class BeautifulSoup(Tag): return prefix + super(BeautifulSoup, self).decode( indent_level, eventual_encoding, formatter) +# Alias to make it easier to type import: 'from bs4 import _soup' +_s = BeautifulSoup +_soup = BeautifulSoup + class BeautifulStoneSoup(BeautifulSoup): """Deprecated interface to an XML parser.""" -- cgit v1.2.3