Fixed DOCTYPE handling.

author: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-26 09:22:42 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-26 09:22:42 -0500
commit: f224b8536ce266538bcfa492ec8d2b3b41fceae5 (patch)
tree: 30d431989c7a5b18da139c8a03d433d5ce2b119d
parent: 105aa2f9a9f833ff98c1706290b07e9228e008a6 (diff)
4 files changed, 32 insertions, 15 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 0d50c80..af8fd16 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,5 +1,8 @@
 = 4.0.0b9 () =
 
+* Fixed the string representation of DOCTYPEs that have both a public
+  ID and a system ID.
+
 * Renamed Tag.nsprefix to Tag.prefix, for consistency with
   NamespacedAttribute.
 
diff --git a/bs4/element.py b/bs4/element.py
index cdc9e36..e50f639 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -507,7 +507,9 @@ class Doctype(NavigableString):
         value = name
         if pub_id is not None:
             value += ' PUBLIC "%s"' % pub_id
-        if system_id is not None:
+            if system_id is not None:
+                value += ' "%s"' % system_id
+        elif system_id is not None:
             value += ' SYSTEM "%s"' % system_id
 
         return Doctype(value)
diff --git a/bs4/testing.py b/bs4/testing.py
index 13a7b5a..49644c3 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -79,6 +79,13 @@ class HTMLTreeBuilderSmokeTest(object):
         self.assertDoctypeHandled(
             'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
 
+    def test_public_doctype_with_url(self):
+        doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
+        self.assertDoctypeHandled(doctype)
+
+    def test_system_doctype(self):
+        self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
+
     def test_namespaced_system_doctype(self):
         # We can handle a namespaced doctype with a system ID.
         self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index d8584b7..10a7e55 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -7,6 +7,7 @@ from bs4.element import (
     SoupStrainer,
     NamespacedAttribute,
     )
+import bs4.dammit
 from bs4.dammit import EntitySubstitution, UnicodeDammit
 from bs4.testing import (
     SoupTest,
@@ -221,9 +222,6 @@ class TestUnicodeDammit(unittest.TestCase):
             self.assertEqual(
                 "euc-jp", dammit.original_encoding)
 
-    @skipIf(
-        CHARDET_PRESENT,
-        "Not testing last-ditch entity replacement because chardet is present and will find an encoding.")
     def test_last_ditch_entity_replacement(self):
         # This is a UTF-8 document that contains bytestrings
         # completely incompatible with UTF-8 (ie. encoded with some other
@@ -238,20 +236,27 @@ class TestUnicodeDammit(unittest.TestCase):
         # can be converted into ISO-8859-1 without errors. This happens
         # to be the wrong encoding, but it is a consistent encoding, so the
         # code we're testing here won't run.
+        #
+        # So we temporarily disable chardet if it's present.
         doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
 <html><b>\330\250\330\252\330\261</b>
 <i>\310\322\321\220\312\321\355\344</i></html>"""
-        with warnings.catch_warnings(record=True) as w:
-            dammit = UnicodeDammit(doc)
-            self.assertEqual(True, dammit.contains_replacement_characters)
-            self.assertTrue(u"\ufffd" in dammit.unicode_markup)
-
-            soup = BeautifulSoup(doc, "html.parser")
-            self.assertTrue(soup.contains_replacement_characters)
-
-            msg = w[0].message
-            self.assertTrue(isinstance(msg, UnicodeWarning))
-            self.assertTrue("Some characters could not be decoded" in str(msg))
+        chardet = bs4.dammit.chardet
+        try:
+            bs4.dammit.chardet = None
+            with warnings.catch_warnings(record=True) as w:
+                dammit = UnicodeDammit(doc)
+                self.assertEqual(True, dammit.contains_replacement_characters)
+                self.assertTrue(u"\ufffd" in dammit.unicode_markup)
+
+                soup = BeautifulSoup(doc, "html.parser")
+                self.assertTrue(soup.contains_replacement_characters)
+
+                msg = w[0].message
+                self.assertTrue(isinstance(msg, UnicodeWarning))
+                self.assertTrue("Some characters could not be decoded" in str(msg))
+        finally:
+            bs4.dammit.chardet = chardet
 
 
 class TestNamedspacedAttribute(SoupTest):
author	Leonard Richardson <leonard.richardson@canonical.com>	2012-02-26 09:22:42 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2012-02-26 09:22:42 -0500
commit	f224b8536ce266538bcfa492ec8d2b3b41fceae5 (patch)
tree	30d431989c7a5b18da139c8a03d433d5ce2b119d
parent	105aa2f9a9f833ff98c1706290b07e9228e008a6 (diff)