summary | refs | log | tree | commit | diff
path: root/bs4/tests
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/tests')
-rw-r--r-- bs4/tests/__init__.py 43
-rw-r--r-- bs4/tests/test_htmlparser.py 6
2 files changed, 41 insertions, 8 deletions
diff --git a/bs4/tests/__init__.py b/bs4/tests/__init__.py
index 224c9d8..4af4b0c 100644
--- a/bs4/tests/__init__.py
+++ b/bs4/tests/__init__.py
@@ -22,7 +22,11 @@ from bs4.element import (
Tag
)
-from bs4.builder import HTMLParserTreeBuilder
+from bs4.builder import (
+ DetectsXMLParsedAsHTML,
+ HTMLParserTreeBuilder,
+ XMLParsedAsHTMLWarning,
+)
default_builder = HTMLParserTreeBuilder
BAD_DOCUMENT = """A bare string
@@ -422,16 +426,43 @@ class HTMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
- soup = self.soup(markup)
+ with warnings.catch_warnings(record=True) as w:
+ soup = self.soup(markup)
assert soup.encode("utf-8").replace(b"\n", b"") == markup.replace(b"\n", b"")
+ # No warning was issued about parsing an XML document as HTML,
+ # because XHTML is both.
+ assert w == []
+
+
def test_namespaced_html(self):
- """When a namespaced XML document is parsed as HTML it should
- be treated as HTML with weird tag names.
- """
+ # When a namespaced XML document is parsed as HTML it should
+ # be treated as HTML with weird tag names.
markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>"""
- soup = self.soup(markup)
+ with warnings.catch_warnings(record=True) as w:
+ soup = self.soup(markup)
+
assert 2 == len(soup.find_all("ns1:foo"))
+
+ # n.b. no "you're parsing XML as HTML" warning was given
+ # because there was no XML declaration.
+ assert [] == w
+
+ def test_detect_xml_parsed_as_html(self):
+ # A warning is issued when parsing an XML document as HTML,
+ # but basic stuff should still work.
+ markup = b"""<?xml version="1.0" encoding="utf-8"?><tag>string</tag>"""
+ with warnings.catch_warnings(record=True) as w:
+ soup = self.soup(markup)
+ assert soup.tag.string == 'string'
+ [warning] = w
+ assert isinstance(warning.message, XMLParsedAsHTMLWarning)
+ assert str(warning.message) == XMLParsedAsHTMLWarning.MESSAGE
+
+ # NOTE: the warning is not issued if the document appears to
+ # be XHTML (tested with test_real_xhtml_document in this
+ # class) or if there is no XML declaration (tested with
+ # test_namespaced_html in this class).
def test_processing_instruction(self):
# We test both Unicode and bytestring to verify that
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 5912cf5..bfcfa1f 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -4,7 +4,10 @@ trees."""
from pdb import set_trace
import pickle
import warnings
-from bs4.builder import HTMLParserTreeBuilder
+from bs4.builder import (
+ HTMLParserTreeBuilder,
+ XMLParsedAsHTMLWarning,
+)
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
from . import SoupTest, HTMLTreeBuilderSmokeTest
@@ -120,7 +123,6 @@ class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
expect = b"<div>%s</div>" % output_element
assert with_element == expect
-
class TestHTMLParserSubclass(SoupTest):
def test_error(self):
"""Verify that our HTMLParser subclass implements error() in a way