From c1a7aaae7140897b2e845be8c5aa077d6654ee0a Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonardr@segfault.org>
Date: Sun, 24 Oct 2021 21:15:31 -0400
Subject: Issue a warning when an HTML parser is used to parse a document that 
  looks like XML but not XHTML. [bug=1939121]

---
 bs4/builder/_htmlparser.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'bs4/builder/_htmlparser.py')
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 70e9be8..fae4d0f 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -44,6 +44,7 @@ from bs4.element import (
 from bs4.dammit import EntitySubstitution, UnicodeDammit
 
 from bs4.builder import (
+    DetectsXMLParsedAsHTML,
     HTML,
     HTMLTreeBuilder,
     STRICT,
@@ -52,7 +53,7 @@ from bs4.builder import (
 
 HTMLPARSER = 'html.parser'
 
-class BeautifulSoupHTMLParser(HTMLParser):
+class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
     """A subclass of the Python standard library's HTMLParser class, which
     listens for HTMLParser events and translates them into calls
     to Beautiful Soup's tree construction API.
@@ -88,6 +89,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
         # will ignore, assuming they ever show up.
         self.already_closed_empty_element = []
 
+        self._initialize_xml_detector()
+        
     def error(self, msg):
         """In Python 3, HTMLParser subclasses must implement error(), although
         this requirement doesn't appear to be documented.
@@ -167,6 +170,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
             # But we might encounter an explicit closing tag for this tag
             # later on. If so, we want to ignore it.
             self.already_closed_empty_element.append(name)
+
+        if self._root_tag is None:
+            self._root_tag_encountered(name)
             
     def handle_endtag(self, name, check_already_closed=True):
         """Handle a closing tag, e.g. '</tag>'
@@ -185,7 +191,7 @@ class BeautifulSoupHTMLParser(HTMLParser):
             self.already_closed_empty_element.remove(name)
         else:
             self.soup.handle_endtag(name)
-
+            
     def handle_data(self, data):
         """Handle some textual data that shows up between tags."""
         self.soup.handle_data(data)
@@ -288,6 +294,7 @@ class BeautifulSoupHTMLParser(HTMLParser):
         """
         self.soup.endData()
         self.soup.handle_data(data)
+        self._document_might_be_xml(data)
         self.soup.endData(ProcessingInstruction)
 
 
-- 
cgit v1.2.3