summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-26 09:22:42 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-26 09:22:42 -0500
commit: f224b8536ce266538bcfa492ec8d2b3b41fceae5 (patch)
tree: 30d431989c7a5b18da139c8a03d433d5ce2b119d
parent: 105aa2f9a9f833ff98c1706290b07e9228e008a6 (diff)
Fixed DOCTYPE handling.
-rw-r--r-- NEWS.txt 3
-rw-r--r-- bs4/element.py 4
-rw-r--r-- bs4/testing.py 7
-rw-r--r-- bs4/tests/test_soup.py 33
4 files changed, 32 insertions, 15 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 0d50c80..af8fd16 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,5 +1,8 @@
= 4.0.0b9 () =
+* Fixed the string representation of DOCTYPEs that have both a public
+ ID and a system ID.
+
* Renamed Tag.nsprefix to Tag.prefix, for consistency with
NamespacedAttribute.
diff --git a/bs4/element.py b/bs4/element.py
index cdc9e36..e50f639 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -507,7 +507,9 @@ class Doctype(NavigableString):
value = name
if pub_id is not None:
value += ' PUBLIC "%s"' % pub_id
- if system_id is not None:
+ if system_id is not None:
+ value += ' "%s"' % system_id
+ elif system_id is not None:
value += ' SYSTEM "%s"' % system_id
return Doctype(value)
diff --git a/bs4/testing.py b/bs4/testing.py
index 13a7b5a..49644c3 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -79,6 +79,13 @@ class HTMLTreeBuilderSmokeTest(object):
self.assertDoctypeHandled(
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
+ def test_public_doctype_with_url(self):
+ doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
+ self.assertDoctypeHandled(doctype)
+
+ def test_system_doctype(self):
+ self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
+
def test_namespaced_system_doctype(self):
# We can handle a namespaced doctype with a system ID.
self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index d8584b7..10a7e55 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -7,6 +7,7 @@ from bs4.element import (
SoupStrainer,
NamespacedAttribute,
)
+import bs4.dammit
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.testing import (
SoupTest,
@@ -221,9 +222,6 @@ class TestUnicodeDammit(unittest.TestCase):
self.assertEqual(
"euc-jp", dammit.original_encoding)
- @skipIf(
- CHARDET_PRESENT,
- "Not testing last-ditch entity replacement because chardet is present and will find an encoding.")
def test_last_ditch_entity_replacement(self):
# This is a UTF-8 document that contains bytestrings
# completely incompatible with UTF-8 (ie. encoded with some other
@@ -238,20 +236,27 @@ class TestUnicodeDammit(unittest.TestCase):
# can be converted into ISO-8859-1 without errors. This happens
# to be the wrong encoding, but it is a consistent encoding, so the
# code we're testing here won't run.
+ #
+ # So we temporarily disable chardet if it's present.
doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
- with warnings.catch_warnings(record=True) as w:
- dammit = UnicodeDammit(doc)
- self.assertEqual(True, dammit.contains_replacement_characters)
- self.assertTrue(u"\ufffd" in dammit.unicode_markup)
-
- soup = BeautifulSoup(doc, "html.parser")
- self.assertTrue(soup.contains_replacement_characters)
-
- msg = w[0].message
- self.assertTrue(isinstance(msg, UnicodeWarning))
- self.assertTrue("Some characters could not be decoded" in str(msg))
+ chardet = bs4.dammit.chardet
+ try:
+ bs4.dammit.chardet = None
+ with warnings.catch_warnings(record=True) as w:
+ dammit = UnicodeDammit(doc)
+ self.assertEqual(True, dammit.contains_replacement_characters)
+ self.assertTrue(u"\ufffd" in dammit.unicode_markup)
+
+ soup = BeautifulSoup(doc, "html.parser")
+ self.assertTrue(soup.contains_replacement_characters)
+
+ msg = w[0].message
+ self.assertTrue(isinstance(msg, UnicodeWarning))
+ self.assertTrue("Some characters could not be decoded" in str(msg))
+ finally:
+ bs4.dammit.chardet = chardet
class TestNamedspacedAttribute(SoupTest):