diff options
author | Leonard Richardson <leonardr@segfault.org> | 2019-11-11 13:51:41 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2019-11-11 13:51:41 -0500 |
commit | 68e5565dd1be82b0f3e981abd8b5419f9d8258b8 (patch) | |
tree | 4ca6790585c1af05f2f269b6b4ab635fe90683e8 | |
parent | 9b72457805dcf60e283bccd947fe4e88c79607a3 (diff) |
The html.parser tree builder now correctly handles DOCTYPEs that are
not uppercase. [bug=1848401]
-rw-r--r-- | CHANGELOG | 3 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 7 | ||||
-rw-r--r-- | bs4/testing.py | 34 | ||||
-rw-r--r-- | doc/source/README | 2 |
4 files changed, 37 insertions, 9 deletions
```diff
@@ -3,6 +3,9 @@
 * Fixed a deprecation warning on Python 3.7. Patch by Colin
   Watson. [bug=1847592]
 
+* The html.parser tree builder now correctly handles DOCTYPEs that are
+  not uppercase. [bug=1848401]
+
 * Added a Chinese translation by Deron Wang and a Brazilian Portuguese
   translation by Cezar Peixeiro to the repository.
 
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index cd50eb0..6a076a1 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -187,9 +187,10 @@ class BeautifulSoupHTMLParser(HTMLParser):
 
     def handle_decl(self, data):
         self.soup.endData()
-        if data.startswith("DOCTYPE "):
-            data = data[len("DOCTYPE "):]
-        elif data == 'DOCTYPE':
+        doctype_len = len("DOCTYPE ")
+        if data[:doctype_len].lower() == "doctype ":
+            data = data[doctype_len:]
+        elif len(data) == doctype_len-1 and data.lower() == 'doctype':
             # i.e. "<!DOCTYPE>"
             data = ''
         self.soup.handle_data(data)
diff --git a/bs4/testing.py b/bs4/testing.py
index 9f12e8d..a162778 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -250,18 +250,21 @@ class HTMLTreeBuilderSmokeTest(object):
         doctype = soup.contents[0]
         self.assertEqual(doctype.__class__, Doctype)
         self.assertEqual(doctype, doctype_fragment)
-        self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
+        self.assertEqual(
+            soup.encode("utf8")[:len(doctype_str)],
+            doctype_str
+        )
 
         # Make sure that the doctype was correctly associated with the
         # parse tree and that the rest of the document parsed.
         self.assertEqual(soup.p.contents[0], 'foo')
 
-    def _document_with_doctype(self, doctype_fragment):
+    def _document_with_doctype(self, doctype_fragment, doctype_string="DOCTYPE"):
         """Generate and parse a document with the given doctype."""
-        doctype = '<!DOCTYPE %s>' % doctype_fragment
+        doctype = '<!%s %s>' % (doctype_string, doctype_fragment)
         markup = doctype + '\n<p>foo</p>'
         soup = self.soup(markup)
-        return doctype, soup
+        return doctype.encode("utf8"), soup
 
     def test_normal_doctypes(self):
         """Make sure normal, everyday HTML doctypes are handled correctly."""
@@ -274,6 +277,27 @@ class HTMLTreeBuilderSmokeTest(object):
         doctype = soup.contents[0]
         self.assertEqual("", doctype.strip())
 
+    def test_mixed_case_doctype(self):
+        # A lowercase or mixed-case doctype becomes a Doctype.
+        for doctype_fragment in ("doctype", "DocType"):
+            doctype_str, soup = self._document_with_doctype(
+                "html", doctype_fragment
+            )
+
+            # Make sure a Doctype object was created and that the DOCTYPE
+            # is uppercase.
+            doctype = soup.contents[0]
+            self.assertEqual(doctype.__class__, Doctype)
+            self.assertEqual(doctype, "html")
+            self.assertEqual(
+                soup.encode("utf8")[:len(doctype_str)],
+                b"<!DOCTYPE html>"
+            )
+
+            # Make sure that the doctype was correctly associated with the
+            # parse tree and that the rest of the document parsed.
+            self.assertEqual(soup.p.contents[0], 'foo')
+
     def test_public_doctype_with_url(self):
         doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
         self.assertDoctypeHandled(doctype)
@@ -828,7 +852,7 @@ class XMLTreeBuilderSmokeTest(object):
         soup = self.soup(markup)
         self.assertEqual(
             soup.encode("utf-8"), markup)
-
+
     def test_nested_namespaces(self):
         doc = b"""<?xml version="1.0" encoding="utf-8"?>
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
diff --git a/doc/source/README b/doc/source/README
index 46478b2..11d808a 100644
--- a/doc/source/README
+++ b/doc/source/README
@@ -18,5 +18,5 @@ Peixeiro. The version in this repository has been modified from
 https://github.com/czrpxr/BeautifulSoup4-ptbr-translation.
 
 doc.zh/source/index.rst is a 2018 Chinese translation by Deron
-Wang. The version in this repository has been copied from
+Wang. The version in this repository has been modified from
 https://github.com/DeronW/beautifulsoup.
```