The html.parser tree builder now correctly handles DOCTYPEs that are not uppercase. [bug=1848401]
author: Leonard Richardson <leonardr@segfault.org> 2019-11-11 13:51:41 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2019-11-11 13:51:41 -0500
commit: 68e5565dd1be82b0f3e981abd8b5419f9d8258b8 (patch)
tree: 4ca6790585c1af05f2f269b6b4ab635fe90683e8
parent: 9b72457805dcf60e283bccd947fe4e88c79607a3 (diff)
4 files changed, 37 insertions, 9 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 450e8d1..921f639 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -3,6 +3,9 @@
 * Fixed a deprecation warning on Python 3.7. Patch by Colin
   Watson. [bug=1847592]
 
+* The html.parser tree builder now correctly handles DOCTYPEs that are
+  not uppercase. [bug=1848401]
+
 * Added a Chinese translation by Deron Wang and a Brazilian Portuguese
   translation by Cezar Peixeiro to the repository.
 
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index cd50eb0..6a076a1 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -187,9 +187,10 @@ class BeautifulSoupHTMLParser(HTMLParser):
 
     def handle_decl(self, data):
         self.soup.endData()
-        if data.startswith("DOCTYPE "):
-            data = data[len("DOCTYPE "):]
-        elif data == 'DOCTYPE':
+        doctype_len = len("DOCTYPE ")
+        if data[:doctype_len].lower() == "doctype ":
+            data = data[doctype_len:]
+        elif len(data) == doctype_len-1 and data.lower() == 'doctype':
             # i.e. "<!DOCTYPE>"
             data = ''
         self.soup.handle_data(data)
diff --git a/bs4/testing.py b/bs4/testing.py
index 9f12e8d..a162778 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -250,18 +250,21 @@ class HTMLTreeBuilderSmokeTest(object):
         doctype = soup.contents[0]
         self.assertEqual(doctype.__class__, Doctype)
         self.assertEqual(doctype, doctype_fragment)
-        self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
+        self.assertEqual(
+            soup.encode("utf8")[:len(doctype_str)],
+            doctype_str
+        )
 
         # Make sure that the doctype was correctly associated with the
         # parse tree and that the rest of the document parsed.
         self.assertEqual(soup.p.contents[0], 'foo')
 
-    def _document_with_doctype(self, doctype_fragment):
+    def _document_with_doctype(self, doctype_fragment, doctype_string="DOCTYPE"):
         """Generate and parse a document with the given doctype."""
-        doctype = '<!DOCTYPE %s>' % doctype_fragment
+        doctype = '<!%s %s>' % (doctype_string, doctype_fragment)
         markup = doctype + '\n<p>foo</p>'
         soup = self.soup(markup)
-        return doctype, soup
+        return doctype.encode("utf8"), soup
 
     def test_normal_doctypes(self):
         """Make sure normal, everyday HTML doctypes are handled correctly."""
@@ -274,6 +277,27 @@ class HTMLTreeBuilderSmokeTest(object):
         doctype = soup.contents[0]
         self.assertEqual("", doctype.strip())
 
+    def test_mixed_case_doctype(self):
+        # A lowercase or mixed-case doctype becomes a Doctype.
+        for doctype_fragment in ("doctype", "DocType"):
+            doctype_str, soup = self._document_with_doctype(
+                "html", doctype_fragment
+            )
+
+            # Make sure a Doctype object was created and that the DOCTYPE
+            # is uppercase.
+            doctype = soup.contents[0]
+            self.assertEqual(doctype.__class__, Doctype)
+            self.assertEqual(doctype, "html")
+            self.assertEqual(
+                soup.encode("utf8")[:len(doctype_str)],
+                b"<!DOCTYPE html>"
+            )
+
+            # Make sure that the doctype was correctly associated with the
+            # parse tree and that the rest of the document parsed.
+            self.assertEqual(soup.p.contents[0], 'foo')
+        
     def test_public_doctype_with_url(self):
         doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
         self.assertDoctypeHandled(doctype)
@@ -828,7 +852,7 @@ class XMLTreeBuilderSmokeTest(object):
         soup = self.soup(markup)
         self.assertEqual(
             soup.encode("utf-8"), markup)
-
+       
     def test_nested_namespaces(self):
         doc = b"""<?xml version="1.0" encoding="utf-8"?>
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
diff --git a/doc/source/README b/doc/source/README
index 46478b2..11d808a 100644
--- a/doc/source/README
+++ b/doc/source/README
@@ -18,5 +18,5 @@ Peixeiro. The version in this repository has been modified from
 https://github.com/czrpxr/BeautifulSoup4-ptbr-translation.
 
 doc.zh/source/index.rst is a 2018 Chinese translation by Deron
-Wang. The version in this repository has been copied from
+Wang. The version in this repository has been modified from
 https://github.com/DeronW/beautifulsoup.
author	Leonard Richardson <leonardr@segfault.org>	2019-11-11 13:51:41 -0500
committer	Leonard Richardson <leonardr@segfault.org>	2019-11-11 13:51:41 -0500
commit	68e5565dd1be82b0f3e981abd8b5419f9d8258b8 (patch)
tree	4ca6790585c1af05f2f269b6b4ab635fe90683e8
parent	9b72457805dcf60e283bccd947fe4e88c79607a3 (diff)