summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt 4
-rw-r--r-- bs4/builder/_htmlparser.py 3
-rw-r--r-- bs4/element.py 2
-rw-r--r-- bs4/testing.py 5
-rw-r--r-- bs4/tests/test_lxml.py 19
5 files changed, 32 insertions, 1 deletion
diff --git a/NEWS.txt b/NEWS.txt
index edbba28..c2739ca 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -17,6 +17,10 @@
* Fix a bug by which keyword arguments to find_parent() were not
being passed on. [bug=1126734]
+* Now that lxml's segfault on invalid doctype has been fixed, fix a
+ corresponding problem on the Beautiful Soup end that was previously
+ invisible. [bug=984936]
+
= 4.1.3 (20120820) =
* Skipped a test under Python 2.6 and Python 3.1 to avoid a spurious
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index ede5cec..e34c9fa 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -85,6 +85,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.soup.endData()
if data.startswith("DOCTYPE "):
data = data[len("DOCTYPE "):]
+ elif data == 'DOCTYPE':
+ # i.e. "<!DOCTYPE>"
+ data = ''
self.soup.handle_data(data)
self.soup.endData(Doctype)
diff --git a/bs4/element.py b/bs4/element.py
index d58da92..f38d9b4 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -779,7 +779,7 @@ class Doctype(PreformattedString):
@classmethod
def for_name_and_ids(cls, name, pub_id, system_id):
- value = name
+ value = name or ''
if pub_id is not None:
value += ' PUBLIC "%s"' % pub_id
if system_id is not None:
diff --git a/bs4/testing.py b/bs4/testing.py
index c9307d3..ed71d3b 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -81,6 +81,11 @@ class HTMLTreeBuilderSmokeTest(object):
self.assertDoctypeHandled(
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
+ def test_empty_doctype(self):
+ soup = self.soup("<!DOCTYPE>")
+ doctype = soup.contents[0]
+ self.assertEqual("", doctype.strip())
+
def test_public_doctype_with_url(self):
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
self.assertDoctypeHandled(doctype)
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 39e26bf..693ec25 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -6,6 +6,14 @@ import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
+    import lxml.etree
+    LXML_VERSION = []  # numeric version components, e.g. [2, 3, 5]
+    for i in lxml.etree.__version__.split('.'):
+        try:
+            part = int(i)
+        except ValueError:  # int() raises ValueError (not TypeError) for parts like "5beta"
+            part = 0
+        LXML_VERSION.append(part)
except ImportError, e:
LXML_PRESENT = False
@@ -41,6 +49,17 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
+ # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
+ # test if an old version of lxml is installed.
+
+    @skipIf(
+        not LXML_PRESENT or LXML_VERSION < [2,3,5],  # LXML_VERSION is unbound if the lxml import failed
+        "Skipping doctype test for old version of lxml to avoid segfault.")
+    def test_empty_doctype(self):
+        soup = self.soup("<!DOCTYPE>")
+        doctype = soup.contents[0]
+        self.assertEqual("", doctype.strip())  # an empty doctype must yield an empty Doctype string
+
def test_beautifulstonesoup_is_xml_parser(self):
# Make sure that the deprecated BSS class uses an xml builder
# if one is installed.