From c1a7aaae7140897b2e845be8c5aa077d6654ee0a Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sun, 24 Oct 2021 21:15:31 -0400 Subject: Issue a warning when an HTML parser is used to parse a document that looks like XML but not XHTML. [bug=1939121] --- bs4/builder/_htmlparser.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'bs4/builder/_htmlparser.py') diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 70e9be8..fae4d0f 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -44,6 +44,7 @@ from bs4.element import ( from bs4.dammit import EntitySubstitution, UnicodeDammit from bs4.builder import ( + DetectsXMLParsedAsHTML, HTML, HTMLTreeBuilder, STRICT, @@ -52,7 +53,7 @@ from bs4.builder import ( HTMLPARSER = 'html.parser' -class BeautifulSoupHTMLParser(HTMLParser): +class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML): """A subclass of the Python standard library's HTMLParser class, which listens for HTMLParser events and translates them into calls to Beautiful Soup's tree construction API. @@ -88,6 +89,8 @@ class BeautifulSoupHTMLParser(HTMLParser): # will ignore, assuming they ever show up. self.already_closed_empty_element = [] + self._initialize_xml_detector() + def error(self, msg): """In Python 3, HTMLParser subclasses must implement error(), although this requirement doesn't appear to be documented. @@ -167,6 +170,9 @@ class BeautifulSoupHTMLParser(HTMLParser): # But we might encounter an explicit closing tag for this tag # later on. If so, we want to ignore it. self.already_closed_empty_element.append(name) + + if self._root_tag is None: + self._root_tag_encountered(name) def handle_endtag(self, name, check_already_closed=True): """Handle a closing tag, e.g. 
'</tag>' @@ -185,7 +191,7 @@ class BeautifulSoupHTMLParser(HTMLParser): self.already_closed_empty_element.remove(name) else: self.soup.handle_endtag(name) - + def handle_data(self, data): """Handle some textual data that shows up between tags.""" self.soup.handle_data(data) @@ -288,6 +294,7 @@ class BeautifulSoupHTMLParser(HTMLParser): """ self.soup.endData() self.soup.handle_data(data) + self._document_might_be_xml(data) self.soup.endData(ProcessingInstruction) -- cgit v1.2.3