summary | refs | log | tree | commit | diff
path: root/bs4/builder/__init__.py
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2017-05-06 13:34:52 -0400
committer Leonard Richardson <leonardr@segfault.org> 2017-05-06 13:34:52 -0400
commit c556b8a6b42843fac40c55459aa5c494e2798349 (patch)
tree 844e681248fe816addb1187538ba40620d9188db /bs4/builder/__init__.py
parent 49cc750524fb436fa4880eefa6c8d0b3bbbd7175 (diff)
HTML parsers treat all HTML4 and HTML5 empty element tags (aka void element tags) correctly. [bug=1656909]
Diffstat (limited to 'bs4/builder/__init__.py')
-rw-r--r-- bs4/builder/__init__.py 9
1 files changed, 7 insertions, 2 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 601979b..fdb3362 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -232,8 +232,13 @@ class HTMLTreeBuilder(TreeBuilder):
"""
preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
- empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
- 'spacer', 'link', 'frame', 'base'])
+ empty_element_tags = set([
+ # These are from HTML5.
+ 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
+
+ # These are from HTML4, removed in HTML5.
+ 'spacer', 'frame'
+ ])
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,