Issue a warning when an HTML parser is used to parse a document that
looks like XML but not XHTML. [bug=1939121]
author: Leonard Richardson <leonardr@segfault.org> 2021-10-24 21:15:31 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2021-10-24 21:15:31 -0400
commit: c1a7aaae7140897b2e845be8c5aa077d6654ee0a (patch)
tree: df6a58adc912d111e619094d7884d034a6649249
parent: dd8aa7237b88569c99e85b300b0cf537aeaebfbd (diff)
8 files changed, 179 insertions, 20 deletions
diff --git a/CHANGELOG b/CHANGELOG
index e3f9167..4470f64 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -15,16 +15,19 @@ Python 2 was revision 605.
   html5lib parser. [bug=1948488]
 
 * Added a workaround for an lxml bug
-  (https://bugs.launchpad.net/lxml/+bug/1948551) that caused
+  (https://bugs.launchpad.net/lxml/+bug/1948551) that causes
   problems when parsing a Unicode string beginning with BYTE ORDER MARK.
   [bug=1947768]
 
+* Issue a warning when an HTML parser is used to parse a document that
+  looks like XML but not XHTML. [bug=1939121]
+
 * Some time ago, the misleadingly named "text" argument to find-type
   methods was renamed to the more accurate "string." But this supposed
   "renaming" didn't make it into important places like the method
   signatures or the docstrings. That's corrected in this
-  version. "text" still works, but will give a deprecation
-  warning. [bug=1947038]
+  version. "text" still works, but will give a DeprecationWarning.
+  [bug=1947038]
 
 = 4.10.0 (20210907)
 
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 2cdfed5..49e05e7 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -35,7 +35,11 @@ import warnings
 if sys.version_info.major < 3:
     raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.')
 
-from .builder import builder_registry, ParserRejectedMarkup
+from .builder import (
+    builder_registry,
+    ParserRejectedMarkup,
+    XMLParsedAsHTMLWarning,
+)
 from .dammit import UnicodeDammit
 from .element import (
     CData,
@@ -67,7 +71,7 @@ class MarkupResemblesLocatorWarning(UserWarning):
     on disk.
     """
 
-
+   
 class BeautifulSoup(Tag):
     """A data structure representing a parsed HTML or XML document.
 
@@ -735,7 +739,7 @@ class BeautifulSoup(Tag):
         #print("End tag: " + name)
         self.endData()
         self._popToTag(name, nsprefix)
-
+        
     def handle_data(self, data):
         """Called by the tree builder when a chunk of textual data is encountered."""
         self.current_data.append(data)
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index fa802f4..9f789f3 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -3,6 +3,8 @@ __license__ = "MIT"
 
 from collections import defaultdict
 import itertools
+import re
+import warnings
 import sys
 from bs4.element import (
     CharsetMetaAttributeValue,
@@ -30,6 +32,12 @@ XML = 'xml'
 HTML = 'html'
 HTML_5 = 'html5'
 
+class XMLParsedAsHTMLWarning(UserWarning):
+    """The warning issued when an HTML parser is used to parse
+    XML that is not XHTML.
+    """
+    MESSAGE = """It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor."""
+
 
 class TreeBuilderRegistry(object):
     """A way of looking up TreeBuilder subclasses by their name or by desired
@@ -441,7 +449,7 @@ class HTMLTreeBuilder(TreeBuilder):
         }
 
     DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
-    
+
     def set_up_substitutions(self, tag):
         """Replace the declared encoding in a <meta> tag with a placeholder,
         to be substituted when the tag is output to a string.
@@ -485,6 +493,99 @@ class HTMLTreeBuilder(TreeBuilder):
 
         return (meta_encoding is not None)
 
+class DetectsXMLParsedAsHTML(object):
+    """A mixin class for any class (a TreeBuilder, or some class used by a
+    TreeBuilder) that's in a position to detect whether an XML
+    document is being incorrectly parsed as HTML, and issue an
+    appropriate warning.
+
+    This requires being able to observe an incoming processing
+    instruction that might be an XML declaration, and also being able to
+    observe tags as they're opened. If you can't do that for a given
+    TreeBuilder, there's a less reliable implementation based on
+    examining the raw markup.
+    """
+
+    # Regular expression for seeing if markup has an <html> tag.
+    LOOKS_LIKE_HTML = re.compile("<[^ +]html", re.I)
+    LOOKS_LIKE_HTML_B = re.compile(b"<[^ +]html", re.I)
+
+    XML_PREFIX = '<?xml'
+    XML_PREFIX_B = b'<?xml'
+    
+    @classmethod
+    def warn_if_markup_looks_like_xml(cls, markup):
+        """Perform a check on some markup to see if it looks like XML
+        that's not XHTML. If so, issue a warning.
+
+        This is much less reliable than doing the check while parsing,
+        but some of the tree builders can't do that.
+
+        :return: True if the markup looks like non-XHTML XML, False
+        otherwise.
+        """
+        if isinstance(markup, bytes):
+            prefix = cls.XML_PREFIX_B
+            looks_like_html = cls.LOOKS_LIKE_HTML_B
+        else:
+            prefix = cls.XML_PREFIX
+            looks_like_html = cls.LOOKS_LIKE_HTML
+        
+        if (markup is not None
+            and markup.startswith(prefix)
+            and not looks_like_html.search(markup[:500])
+        ):
+            cls._warn()
+            return True
+        return False
+
+    @classmethod
+    def _warn(cls):
+        """Issue a warning about XML being parsed as HTML."""
+        warnings.warn(
+            XMLParsedAsHTMLWarning.MESSAGE, XMLParsedAsHTMLWarning
+        )
+        
+    def _initialize_xml_detector(self):
+        """Call this method before parsing a document."""
+        self._first_processing_instruction = None
+        self._root_tag = None
+       
+    def _document_might_be_xml(self, processing_instruction):
+        """Call this method when encountering an XML declaration, or a
+        "processing instruction" that might be an XML declaration.
+        """
+        if (self._first_processing_instruction is not None
+            or self._root_tag is not None):
+            # The document has already started. Don't bother checking
+            # anymore.
+            return
+
+        self._first_processing_instruction = processing_instruction
+
+        # We won't know until we encounter the first tag whether or
+        # not this is actually a problem.
+        
+    def _root_tag_encountered(self, name):
+        """Call this when you encounter the document's root tag.
+
+        This is where we actually check whether an XML document is
+        being incorrectly parsed as HTML, and issue the warning.
+        """
+        if self._root_tag is not None:
+            # This method was incorrectly called multiple times. Do
+            # nothing.
+            return
+
+        self._root_tag = name
+        if (name != 'html' and self._first_processing_instruction is not None
+            and self._first_processing_instruction.lower().startswith('xml ')):
+            # We encountered an XML declaration and then a tag other
+            # than 'html'. This is a reliable indicator that a
+            # non-XHTML XML document is being parsed as HTML.
+            self._warn()
+
+    
 def register_treebuilders_from(module):
     """Copy TreeBuilders from the given module into this module."""
     this_module = sys.modules[__name__]
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 914b1df..58bc176 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -8,6 +8,7 @@ __all__ = [
 import warnings
 import re
 from bs4.builder import (
+    DetectsXMLParsedAsHTML,
     PERMISSIVE,
     HTML,
     HTML_5,
@@ -70,6 +71,11 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
         # UnicodeDammit.
         if exclude_encodings:
             warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
+
+        # html5lib only parses HTML, so if it's given XML that's worth
+        # noting.
+        DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup)
+
         yield (markup, None, None, False)
 
     # These methods are defined by Beautiful Soup.
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 70e9be8..fae4d0f 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -44,6 +44,7 @@ from bs4.element import (
 from bs4.dammit import EntitySubstitution, UnicodeDammit
 
 from bs4.builder import (
+    DetectsXMLParsedAsHTML,
     HTML,
     HTMLTreeBuilder,
     STRICT,
@@ -52,7 +53,7 @@ from bs4.builder import (
 
 HTMLPARSER = 'html.parser'
 
-class BeautifulSoupHTMLParser(HTMLParser):
+class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
     """A subclass of the Python standard library's HTMLParser class, which
     listens for HTMLParser events and translates them into calls
     to Beautiful Soup's tree construction API.
@@ -88,6 +89,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
         # will ignore, assuming they ever show up.
         self.already_closed_empty_element = []
 
+        self._initialize_xml_detector()
+        
     def error(self, msg):
         """In Python 3, HTMLParser subclasses must implement error(), although
         this requirement doesn't appear to be documented.
@@ -167,6 +170,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
             # But we might encounter an explicit closing tag for this tag
             # later on. If so, we want to ignore it.
             self.already_closed_empty_element.append(name)
+
+        if self._root_tag is None:
+            self._root_tag_encountered(name)
             
     def handle_endtag(self, name, check_already_closed=True):
         """Handle a closing tag, e.g. '</tag>'
@@ -185,7 +191,7 @@ class BeautifulSoupHTMLParser(HTMLParser):
             self.already_closed_empty_element.remove(name)
         else:
             self.soup.handle_endtag(name)
-
+            
     def handle_data(self, data):
         """Handle some textual data that shows up between tags."""
         self.soup.handle_data(data)
@@ -288,6 +294,7 @@ class BeautifulSoupHTMLParser(HTMLParser):
         """
         self.soup.endData()
         self.soup.handle_data(data)
+        self._document_might_be_xml(data)
         self.soup.endData(ProcessingInstruction)
 
 
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 1334f94..d8251b2 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -22,6 +22,7 @@ from bs4.element import (
     XMLProcessingInstruction,
 )
 from bs4.builder import (
+    DetectsXMLParsedAsHTML,
     FAST,
     HTML,
     HTMLTreeBuilder,
@@ -166,6 +167,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         is_html = not self.is_xml
         if is_html:
             self.processing_instruction_class = ProcessingInstruction
+            # We're in HTML mode, so if we're given XML, that's worth
+            # noting.
+            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup)
         else:
             self.processing_instruction_class = XMLProcessingInstruction
 
@@ -271,7 +275,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         namespace, name = self._getNsTag(name)
         nsprefix = self._prefix_for_namespace(namespace)
         self.soup.handle_starttag(name, namespace, nsprefix, attrs)
-
+      
     def _prefix_for_namespace(self, namespace):
         """Find the currently active prefix for the given namespace."""
         if namespace is None:
@@ -299,9 +303,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
 
     def pi(self, target, data):
         self.soup.endData()
-        self.soup.handle_data(target + ' ' + data)
+        data = target + ' ' + data
+        self.soup.handle_data(data)
         self.soup.endData(self.processing_instruction_class)
-
+        
     def data(self, content):
         self.soup.handle_data(content)
 
diff --git a/bs4/tests/__init__.py b/bs4/tests/__init__.py
index 224c9d8..4af4b0c 100644
--- a/bs4/tests/__init__.py
+++ b/bs4/tests/__init__.py
@@ -22,7 +22,11 @@ from bs4.element import (
     Tag
 )
 
-from bs4.builder import HTMLParserTreeBuilder
+from bs4.builder import (
+    DetectsXMLParsedAsHTML,
+    HTMLParserTreeBuilder,
+    XMLParsedAsHTMLWarning,
+)
 default_builder = HTMLParserTreeBuilder
 
 BAD_DOCUMENT = """A bare string
@@ -422,16 +426,43 @@ class HTMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
 <head><title>Hello.</title></head>
 <body>Goodbye.</body>
 </html>"""
-        soup = self.soup(markup)
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup(markup)
         assert soup.encode("utf-8").replace(b"\n", b"") == markup.replace(b"\n", b"")
 
+        # No warning was issued about parsing an XML document as HTML,
+        # because XHTML is both.
+        assert w == []
+
+
     def test_namespaced_html(self):
-        """When a namespaced XML document is parsed as HTML it should
-        be treated as HTML with weird tag names.
-        """
+        # When a namespaced XML document is parsed as HTML it should
+        # be treated as HTML with weird tag names.
         markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>"""
-        soup = self.soup(markup)
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup(markup)
+
         assert 2 == len(soup.find_all("ns1:foo"))
+            
+        # n.b. no "you're parsing XML as HTML" warning was given
+        # because there was no XML declaration.
+        assert [] == w
+
+    def test_detect_xml_parsed_as_html(self):
+        # A warning is issued when parsing an XML document as HTML,
+        # but basic stuff should still work.
+        markup = b"""<?xml version="1.0" encoding="utf-8"?><tag>string</tag>"""
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup(markup)
+            assert soup.tag.string == 'string'
+        [warning] = w
+        assert isinstance(warning.message, XMLParsedAsHTMLWarning)
+        assert str(warning.message) == XMLParsedAsHTMLWarning.MESSAGE
+
+        # NOTE: the warning is not issued if the document appears to
+        # be XHTML (tested with test_real_xhtml_document in the
+        # superclass) or if there is no XML declaration (tested with
+        # test_namespaced_html in the superclass).
         
     def test_processing_instruction(self):
         # We test both Unicode and bytestring to verify that
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 5912cf5..bfcfa1f 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -4,7 +4,10 @@ trees."""
 from pdb import set_trace
 import pickle
 import warnings
-from bs4.builder import HTMLParserTreeBuilder
+from bs4.builder import (
+    HTMLParserTreeBuilder,
+    XMLParsedAsHTMLWarning,
+)
 from bs4.builder._htmlparser import BeautifulSoupHTMLParser
 from . import SoupTest, HTMLTreeBuilderSmokeTest
 
@@ -120,7 +123,6 @@ class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
             expect = b"<div>%s</div>" % output_element
             assert with_element == expect
 
-
 class TestHTMLParserSubclass(SoupTest):
     def test_error(self):
         """Verify that our HTMLParser subclass implements error() in a way
author	Leonard Richardson <leonardr@segfault.org>	2021-10-24 21:15:31 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2021-10-24 21:15:31 -0400
commit	c1a7aaae7140897b2e845be8c5aa077d6654ee0a (patch)
tree	df6a58adc912d111e619094d7884d034a6649249
parent	dd8aa7237b88569c99e85b300b0cf537aeaebfbd (diff)