From a6f897b213bb08f0d8d8a1528937541c280abbd6 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sun, 5 Apr 2020 15:43:58 -0400 Subject: Embedded CSS and JavaScript are now stored in distinct Stylesheet and Script tags, which are ignored by methods like get_text(). This feature is not supported by the html5lib treebuilder. [bug=1868861] --- bs4/builder/_html5lib.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'bs4/builder/_html5lib.py') diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 32a0856..b36189d 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -39,7 +39,18 @@ except ImportError, e: new_html5lib = True class HTML5TreeBuilder(HTMLTreeBuilder): - """Use html5lib to build a tree.""" + """Use html5lib to build a tree. + + Note that this TreeBuilder does not support some features common + to HTML TreeBuilders. Some of these features could theoretically + be implemented, but at the very least it's quite difficult, + because html5lib moves the parse tree around as it's being built. + + * This TreeBuilder doesn't use different subclasses of NavigableString + based on the name of the tag in which the string was found. + + * You can't use a SoupStrainer to parse only part of a document. + """ NAME = "html5lib" @@ -116,6 +127,9 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): "", "html.parser", store_line_numbers=store_line_numbers, **kwargs ) + # TODO: What are **kwargs exactly? Should they be passed in + # here in addition to/instead of being passed to the BeautifulSoup + # constructor? super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) # This will be set later to an html5lib.html5parser.HTMLParser -- cgit v1.2.3