summary | refs | log | tree | commit | diff
path: root/bs4/builder/_html5lib.py
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2020-04-05 15:43:58 -0400
committer Leonard Richardson <leonardr@segfault.org> 2020-04-05 15:43:58 -0400
commit a6f897b213bb08f0d8d8a1528937541c280abbd6 (patch)
tree 866d3392a854ea27a172e9b456b2160307e39363 /bs4/builder/_html5lib.py
parent ddadf13ef66122d75eadaf7f10e0937429e6a3a6 (diff)
Embedded CSS and JavaScript are now stored in distinct Stylesheet and
Script string objects (subclasses of NavigableString, chosen by the tag in which the string was found), which are ignored by methods like get_text(). This feature is not supported by the html5lib treebuilder. [bug=1868861]
Diffstat (limited to 'bs4/builder/_html5lib.py')
-rw-r--r-- bs4/builder/_html5lib.py 16
1 files changed, 15 insertions, 1 deletions
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 32a0856..b36189d 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -39,7 +39,18 @@ except ImportError, e:
new_html5lib = True
class HTML5TreeBuilder(HTMLTreeBuilder):
- """Use html5lib to build a tree."""
+ """Use html5lib to build a tree.
+
+ Note that this TreeBuilder does not support some features common
+ to HTML TreeBuilders. Some of these features could theoretically
+ be implemented, but at the very least it's quite difficult,
+ because html5lib moves the parse tree around as it's being built.
+
+ * This TreeBuilder doesn't use different subclasses of NavigableString
+ based on the name of the tag in which the string was found.
+
+ * You can't use a SoupStrainer to parse only part of a document.
+ """
NAME = "html5lib"
@@ -116,6 +127,9 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
"", "html.parser", store_line_numbers=store_line_numbers,
**kwargs
)
+ # TODO: What are **kwargs exactly? Should they be passed in
+ # here in addition to/instead of being passed to the BeautifulSoup
+ # constructor?
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
# This will be set later to an html5lib.html5parser.HTMLParser