summary | refs | log | tree | commit | diff
path: root/bs4
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2012-04-18 08:45:51 -0400
committer Leonard Richardson <leonard.richardson@canonical.com> 2012-04-18 08:45:51 -0400
commit 3d0ae02cc3d0b947ef6102b31f4b354eec9b543a (patch)
tree e4ee65c0f4d9c32c6bf738a3524c66b6a5737d40 /bs4
parent 4a587ff6996a2192944d7fec341180c2a116ea17 (diff)
Fixed a bug that made the HTMLParser treebuilder generate XML definitions ending with two question marks instead of one. [bug=984258]
Diffstat (limited to 'bs4')
-rw-r--r-- bs4/builder/_htmlparser.py 8
-rw-r--r-- bs4/testing.py 36
-rw-r--r-- bs4/tests/test_htmlparser.py 1
-rw-r--r-- bs4/tests/test_lxml.py 15
4 files changed, 51 insertions, 9 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index c307ff8..3dee51b 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -96,6 +96,14 @@ class BeautifulSoupHTMLParser(HTMLParser):
def handle_pi(self, data):
self.soup.endData()
+ if data.endswith("?") and data.lower().startswith("xml"):
+ # "An XHTML processing instruction using the trailing '?'
+ # will cause the '?' to be included in data." - HTMLParser
+ # docs.
+ #
+ # Strip the question mark so we don't end up with two
+ # question marks.
+ data = data[:-1]
self.soup.handle_data(data)
self.soup.endData(ProcessingInstruction)
diff --git a/bs4/testing.py b/bs4/testing.py
index e9c505c..41c8783 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -94,6 +94,19 @@ class HTMLTreeBuilderSmokeTest(object):
# Test a namespaced doctype with a public id.
self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
+ def test_real_xhtml_document(self):
+ """A real XHTML document should come out more or less the same as it went in."""
+ markup = b"""<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>Hello.</title></head>
+<body>Goodbye.</body>
+</html>"""
+ soup = self.soup(markup)
+ self.assertEqual(
+ soup.encode("utf-8").replace(b"\n", b""),
+ markup.replace(b"\n", b""))
+
def test_deepcopy(self):
"""Make sure you can copy the tree builder.
@@ -393,14 +406,8 @@ class XMLTreeBuilderSmokeTest(object):
self.assertEqual(
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
- def test_docstring_includes_correct_encoding(self):
- soup = self.soup("<root/>")
- self.assertEqual(
- soup.encode("latin1"),
- b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
-
def test_real_xhtml_document(self):
- """A real XHTML document should come out the same as it went in."""
+ """A real XHTML document should come out *exactly* the same as it went in."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
@@ -408,7 +415,15 @@ class XMLTreeBuilderSmokeTest(object):
<body>Goodbye.</body>
</html>"""
soup = self.soup(markup)
- self.assertEqual(soup.encode("utf-8"), markup)
+ self.assertEqual(
+ soup.encode("utf-8"), markup)
+
+
+ def test_docstring_includes_correct_encoding(self):
+ soup = self.soup("<root/>")
+ self.assertEqual(
+ soup.encode("latin1"),
+ b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
def test_large_xml_document(self):
"""A large XML document should come out the same as it went in."""
@@ -434,6 +449,11 @@ class XMLTreeBuilderSmokeTest(object):
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5."""
+ def test_real_xhtml_document(self):
+ # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
+ # XHTML documents in any particular way.
+ pass
+
def test_html_tags_have_namespace(self):
markup = "<a>"
soup = self.soup(markup)
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 6215185..bcb5ed2 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -17,4 +17,3 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
def test_namespaced_public_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass
-
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 4e0b12e..39e26bf 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -48,6 +48,21 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
soup = BeautifulStoneSoup("<b />")
self.assertEqual(u"<b/>", unicode(soup.b))
+ def test_real_xhtml_document(self):
+ """lxml strips the XML definition from an XHTML doc, which is fine."""
+ markup = b"""<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>Hello.</title></head>
+<body>Goodbye.</body>
+</html>"""
+ soup = self.soup(markup)
+ self.assertEqual(
+ soup.encode("utf-8").replace(b"\n", b''),
+ markup.replace(b'\n', b'').replace(
+ b'<?xml version="1.0" encoding="utf-8"?>', b''))
+
+
@skipIf(
not LXML_PRESENT,
"lxml seems not to be present, not testing its XML tree builder.")