diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-26 09:22:42 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-26 09:22:42 -0500 |
commit | f224b8536ce266538bcfa492ec8d2b3b41fceae5 (patch) | |
tree | 30d431989c7a5b18da139c8a03d433d5ce2b119d | |
parent | 105aa2f9a9f833ff98c1706290b07e9228e008a6 (diff) |
Fixed DOCTYPE handling.
-rw-r--r-- | NEWS.txt | 3 |
-rw-r--r-- | bs4/element.py | 4 |
-rw-r--r-- | bs4/testing.py | 7 |
-rw-r--r-- | bs4/tests/test_soup.py | 33 |
4 files changed, 32 insertions, 15 deletions
```diff
diff --git a/NEWS.txt b/NEWS.txt
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,5 +1,8 @@
 = 4.0.0b9 () =
 
+* Fixed the string representation of DOCTYPEs that have both a public
+  ID and a system ID.
+
 * Renamed Tag.nsprefix to Tag.prefix, for consistency with
   NamespacedAttribute.
 
diff --git a/bs4/element.py b/bs4/element.py
index cdc9e36..e50f639 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -507,7 +507,9 @@ class Doctype(NavigableString):
         value = name
         if pub_id is not None:
             value += ' PUBLIC "%s"' % pub_id
-        if system_id is not None:
+            if system_id is not None:
+                value += ' "%s"' % system_id
+        elif system_id is not None:
             value += ' SYSTEM "%s"' % system_id
 
         return Doctype(value)
diff --git a/bs4/testing.py b/bs4/testing.py
index 13a7b5a..49644c3 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -79,6 +79,13 @@ class HTMLTreeBuilderSmokeTest(object):
         self.assertDoctypeHandled(
             'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
 
+    def test_public_doctype_with_url(self):
+        doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
+        self.assertDoctypeHandled(doctype)
+
+    def test_system_doctype(self):
+        self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
+
     def test_namespaced_system_doctype(self):
         # We can handle a namespaced doctype with a system ID.
         self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index d8584b7..10a7e55 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -7,6 +7,7 @@ from bs4.element import (
     SoupStrainer,
     NamespacedAttribute,
     )
+import bs4.dammit
 from bs4.dammit import EntitySubstitution, UnicodeDammit
 from bs4.testing import (
     SoupTest,
@@ -221,9 +222,6 @@ class TestUnicodeDammit(unittest.TestCase):
         self.assertEqual(
             "euc-jp", dammit.original_encoding)
 
-    @skipIf(
-        CHARDET_PRESENT,
-        "Not testing last-ditch entity replacement because chardet is present and will find an encoding.")
     def test_last_ditch_entity_replacement(self):
         # This is a UTF-8 document that contains bytestrings
         # completely incompatible with UTF-8 (ie. encoded with some other
@@ -238,20 +236,27 @@ class TestUnicodeDammit(unittest.TestCase):
         # can be converted into ISO-8859-1 without errors. This happens
         # to be the wrong encoding, but it is a consistent encoding, so the
         # code we're testing here won't run.
+        #
+        # So we temporarily disable chardet if it's present.
         doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
 <html><b>\330\250\330\252\330\261</b>
 <i>\310\322\321\220\312\321\355\344</i></html>"""
-        with warnings.catch_warnings(record=True) as w:
-            dammit = UnicodeDammit(doc)
-            self.assertEqual(True, dammit.contains_replacement_characters)
-            self.assertTrue(u"\ufffd" in dammit.unicode_markup)
-
-            soup = BeautifulSoup(doc, "html.parser")
-            self.assertTrue(soup.contains_replacement_characters)
-
-            msg = w[0].message
-            self.assertTrue(isinstance(msg, UnicodeWarning))
-            self.assertTrue("Some characters could not be decoded" in str(msg))
+        chardet = bs4.dammit.chardet
+        try:
+            bs4.dammit.chardet = None
+            with warnings.catch_warnings(record=True) as w:
+                dammit = UnicodeDammit(doc)
+                self.assertEqual(True, dammit.contains_replacement_characters)
+                self.assertTrue(u"\ufffd" in dammit.unicode_markup)
+
+                soup = BeautifulSoup(doc, "html.parser")
+                self.assertTrue(soup.contains_replacement_characters)
+
+                msg = w[0].message
+                self.assertTrue(isinstance(msg, UnicodeWarning))
+                self.assertTrue("Some characters could not be decoded" in str(msg))
+        finally:
+            bs4.dammit.chardet = chardet
 
 class TestNamedspacedAttribute(SoupTest):
 
```