summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt 11
-rw-r--r-- bs4/__init__.py 4
-rw-r--r-- bs4/builder/_htmlparser.py 3
-rw-r--r-- bs4/element.py 2
-rw-r--r-- bs4/testing.py 5
-rw-r--r-- bs4/tests/test_lxml.py 13
6 files changed, 37 insertions, 1 deletion
diff --git a/NEWS.txt b/NEWS.txt
index 781a514..f714bfe 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -8,6 +8,13 @@
you strings that are visible in the document--no comments or
processing commands. [bug=1050164]
+* The BeautifulSoup class is now aliased to "_s" and "_soup", making
+ it quicker to type an import statement in an interactive session:
+
+ from bs4 import _s
+ or
+ from bs4 import _soup
+
* Fix a bug in the html5lib treebuilder which sometimes created
disconnected trees. [bug=1039527]
@@ -20,6 +27,10 @@
* Stop a crash when unwisely messing with a tag that's been
decomposed. [bug=1097699]
+* Now that lxml's segfault on invalid doctype has been fixed, fix a
+ corresponding problem on the Beautiful Soup end that was previously
+ invisible. [bug=984936]
+
= 4.1.3 (20120820) =
* Skipped a test under Python 2.6 and Python 3.1 to avoid a spurious
diff --git a/bs4/__init__.py b/bs4/__init__.py
index fe2656b..88177d6 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -335,6 +335,10 @@ class BeautifulSoup(Tag):
return prefix + super(BeautifulSoup, self).decode(
indent_level, eventual_encoding, formatter)
+# Alias to make it easier to type import: 'from bs4 import _soup'
+_s = BeautifulSoup
+_soup = BeautifulSoup
+
class BeautifulStoneSoup(BeautifulSoup):
"""Deprecated interface to an XML parser."""
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index ede5cec..e34c9fa 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -85,6 +85,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.soup.endData()
if data.startswith("DOCTYPE "):
data = data[len("DOCTYPE "):]
+ elif data == 'DOCTYPE':
+ # i.e. "<!DOCTYPE>"
+ data = ''
self.soup.handle_data(data)
self.soup.endData(Doctype)
diff --git a/bs4/element.py b/bs4/element.py
index c081eba..5ccb019 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -779,7 +779,7 @@ class Doctype(PreformattedString):
@classmethod
def for_name_and_ids(cls, name, pub_id, system_id):
- value = name
+ value = name or ''
if pub_id is not None:
value += ' PUBLIC "%s"' % pub_id
if system_id is not None:
diff --git a/bs4/testing.py b/bs4/testing.py
index c9307d3..ed71d3b 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -81,6 +81,11 @@ class HTMLTreeBuilderSmokeTest(object):
self.assertDoctypeHandled(
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
+ def test_empty_doctype(self):
+ soup = self.soup("<!DOCTYPE>")
+ doctype = soup.contents[0]
+ self.assertEqual("", doctype.strip())
+
def test_public_doctype_with_url(self):
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
self.assertDoctypeHandled(doctype)
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 39e26bf..f32fc2b 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -6,6 +6,8 @@ import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
+ import lxml.etree
+ LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError, e:
LXML_PRESENT = False
@@ -41,6 +43,17 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
+ # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
+ # test if an old version of lxml is installed.
+
+ @skipIf(
+ LXML_VERSION < (2,3,5,0),
+ "Skipping doctype test for old version of lxml to avoid segfault.")
+ def test_empty_doctype(self):
+ soup = self.soup("<!DOCTYPE>")
+ doctype = soup.contents[0]
+ self.assertEqual("", doctype.strip())
+
def test_beautifulstonesoup_is_xml_parser(self):
# Make sure that the deprecated BSS class uses an xml builder
# if one is installed.