diff options
-rw-r--r-- | NEWS.txt | 4 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 3 | ||||
-rw-r--r-- | bs4/element.py | 2 | ||||
-rw-r--r-- | bs4/testing.py | 5 | ||||
-rw-r--r-- | bs4/tests/test_lxml.py | 19 |
5 files changed, 32 insertions, 1 deletion
@@ -17,6 +17,10 @@ * Fix a bug by which keyword arguments to find_parent() were not being passed on. [bug=1126734] +* Now that lxml's segfault on invalid doctype has been fixed, fix a + corresponding problem on the Beautiful Soup end that was previously + invisible. [bug=984936] + = 4.1.3 (20120820) = * Skipped a test under Python 2.6 and Python 3.1 to avoid a spurious diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index ede5cec..e34c9fa 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -85,6 +85,9 @@ class BeautifulSoupHTMLParser(HTMLParser): self.soup.endData() if data.startswith("DOCTYPE "): data = data[len("DOCTYPE "):] + elif data == 'DOCTYPE': + # i.e. "<!DOCTYPE>" + data = '' self.soup.handle_data(data) self.soup.endData(Doctype) diff --git a/bs4/element.py b/bs4/element.py index d58da92..f38d9b4 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -779,7 +779,7 @@ class Doctype(PreformattedString): @classmethod def for_name_and_ids(cls, name, pub_id, system_id): - value = name + value = name or '' if pub_id is not None: value += ' PUBLIC "%s"' % pub_id if system_id is not None: diff --git a/bs4/testing.py b/bs4/testing.py index c9307d3..ed71d3b 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -81,6 +81,11 @@ class HTMLTreeBuilderSmokeTest(object): self.assertDoctypeHandled( 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') + def test_empty_doctype(self): + soup = self.soup("<!DOCTYPE>") + doctype = soup.contents[0] + self.assertEqual("", doctype.strip()) + def test_public_doctype_with_url(self): doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' self.assertDoctypeHandled(doctype) diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index 39e26bf..693ec25 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -6,6 +6,14 @@ import warnings try: from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML 
LXML_PRESENT = True + import lxml.etree + LXML_VERSION = [] + for i in lxml.etree.__version__.split('.'): + try: + part = int(i) + except ValueError: + part = 0 + LXML_VERSION.append(part) except ImportError, e: LXML_PRESENT = False @@ -41,6 +49,17 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): self.assertSoupEquals( "<p>foo�bar</p>", "<p>foobar</p>") + # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this + # test if lxml is missing or an old version of lxml is installed. + + @skipIf( + not LXML_PRESENT or LXML_VERSION < [2,3,5], + "Skipping doctype test for old version of lxml to avoid segfault.") + def test_empty_doctype(self): + soup = self.soup("<!DOCTYPE>") + doctype = soup.contents[0] + self.assertEqual("", doctype.strip()) + def test_beautifulstonesoup_is_xml_parser(self): # Make sure that the deprecated BSS class uses an xml builder # if one is installed. |