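"""Tests of HTMLParserTreeBuilder, the tree builder built on Python's
standard-library HTMLParser.

Most of the coverage is inherited from the lxml builder's test suite; the
overrides below document where HTMLParser behaves differently.
"""
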
from HTMLParser import HTMLParseError
from bs4.builder import HTMLParserTreeBuilder
from bs4.element import CData
from test_lxml import (
TestLXMLBuilder,
TestLXMLBuilderEncodingConversion,
TestLXMLBuilderInvalidMarkup,
    )


class TestHTMLParserTreeBuilder(TestLXMLBuilder):
"""See `BuilderSmokeTest`."""
@property
def default_builder(self):
return HTMLParserTreeBuilder()
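
    # Usage sketch (an assumption about the shared test helper, not something
    # this file defines): self.soup(markup) builds a soup with this builder,
    # roughly BeautifulSoup(markup, builder=self.default_builder).
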
def test_bare_string(self):
# A bare string is turned into some kind of HTML document or
# fragment recognizable as the original string.
#
# HTMLParser does not modify the bare string at all.
self.assertSoupEquals("A bare string")
def test_cdata_where_its_ok(self):
# HTMLParser recognizes CDATA sections and passes them through.
markup = "<svg><![CDATA[foobar]]></svg>"
self.assertSoupEquals(markup)
soup = self.soup(markup)
string = soup.svg.string
self.assertEquals(string, "foobar")
        self.assertTrue(isinstance(string, CData))

# These are tests that could be 'fixed' by improving the
# HTMLParserTreeBuilder, but I don't think it's worth it. Users
# will have fewer headaches if they use one of the other tree
    # builders.

def test_empty_element(self):
# HTML's empty-element tags are not recognized as such
# unless they are presented as empty-element tags.
self.assertSoupEquals(
"<p>A <meta> tag</p>", "<p>A <meta> tag</meta></p>")
        self.assertSoupEquals(
            "<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>")

    def test_entities_in_attribute_values_converted_during_parsing(self):
        # The numeric entity isn't recognized without the closing
        # semicolon.
        text = '<x t="pi&#241ata">'
        expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
        soup = self.soup(text)
        self.assertEquals(soup.x['t'], "pi&#241ata")

        # With the closing semicolon, the decimal entity is converted.
        text = '<x t="pi&#241;ata">'
        expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
        soup = self.soup(text)
        self.assertEquals(soup.x['t'], u"pi\xf1ata")

        # So is the hexadecimal form.
        text = '<x t="pi&#xf1;ata">'
        soup = self.soup(text)
        self.assertEquals(soup.x['t'], expected)

        # And a named entity.
        text = '<x t="sacr&eacute; bleu">'
        soup = self.soup(text)
        self.assertEquals(
            soup.x['t'],
            u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu")

        # This can cause valid HTML to become invalid.
        valid_url = '<a href="http://example.org?a=1&amp;b=2;3">foo</a>'
        soup = self.soup(valid_url)
        self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3")

# I think it would be very difficult to 'fix' these tests, judging
    # from my experience with previous versions of Beautiful Soup.

def test_naked_ampersands(self):
# Ampersands are treated as entities.
text = "<p>AT&T</p>"
soup = self.soup(text)
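        # HTMLParser reports the bare "&T" as an entity reference, and the
        # builder appears to re-serialize it with a trailing semicolon,
        # hence the "AT&T;" below.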
        self.assertEquals(soup.p.string, "AT&T;")

    def test_literal_in_textarea(self):
        # Anything inside a <textarea> is supposed to be treated as
        # the literal value of the field (XXX citation needed).
        # html5lib does this correctly, but HTMLParser does its best
        # to parse the contents of a <textarea> as HTML.
text = '<textarea>Junk like <b> tags and <&<&</textarea>'
soup = self.soup(text)
self.assertEquals(len(soup.textarea.contents), 2)
self.assertEquals(soup.textarea.contents[0], u"Junk like ")
self.assertEquals(soup.textarea.contents[1].name, 'b')
        self.assertEquals(soup.textarea.b.string, u" tags and <&<&")

def test_literal_in_script(self):
# The contents of a <script> tag are supposed to be treated as
# a literal string, even if that string contains HTML. But
# HTMLParser attempts to parse some of the HTML, causing much
# pain.
javascript = 'if (i < 2) { alert("<b>foo</b>"); }'
soup = self.soup('<script>%s</script>' % javascript)
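        # The "</b>" inside the string literal is treated as real markup,
        # so the script text comes back split into two strings around it.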
        self.assertEquals(soup.script.contents,
                          ['if (i < 2) { alert("<b>foo',
                           '"); }'])

# Namespaced doctypes cause an HTMLParseError
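    # (Both tests call _test_doctype, which is assumed to come from the
    # lxml-based parent class and to wrap the fragment in a full
    # <!DOCTYPE ...> declaration.)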
def test_namespaced_system_doctype(self):
        self.assertRaises(HTMLParseError, self._test_doctype,
                          'xsl:stylesheet SYSTEM "htmlent.dtd"')

def test_namespaced_public_doctype(self):
        self.assertRaises(HTMLParseError, self._test_doctype,
                          'xsl:stylesheet PUBLIC "htmlent.dtd"')


class TestHTMLParserTreeBuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
# Oddly enough, HTMLParser seems to handle invalid markup exactly
# the same as lxml.
    pass


class TestHTMLParserTreeBuilderEncodingConversion(
TestLXMLBuilderEncodingConversion):
# Re-run the lxml tests for HTMLParser
pass