diff options
author | Leonard Richardson <leonardr@segfault.org> | 2020-04-05 15:43:58 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2020-04-05 15:43:58 -0400 |
commit | a6f897b213bb08f0d8d8a1528937541c280abbd6 (patch) | |
tree | 866d3392a854ea27a172e9b456b2160307e39363 /bs4/builder/_html5lib.py | |
parent | ddadf13ef66122d75eadaf7f10e0937429e6a3a6 (diff) |
Embedded CSS and JavaScript are now stored in distinct Stylesheet and
Script tags, which are ignored by methods like get_text(). This
feature is not supported by the html5lib treebuilder. [bug=1868861]
Diffstat (limited to 'bs4/builder/_html5lib.py')
-rw-r--r-- | bs4/builder/_html5lib.py | 16 |
1 file changed, 15 insertions, 1 deletion
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 32a0856..b36189d 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -39,7 +39,18 @@ except ImportError, e:
     new_html5lib = True
 
 class HTML5TreeBuilder(HTMLTreeBuilder):
-    """Use html5lib to build a tree."""
+    """Use html5lib to build a tree.
+
+    Note that this TreeBuilder does not support some features common
+    to HTML TreeBuilders. Some of these features could theoretically
+    be implemented, but at the very least it's quite difficult,
+    because html5lib moves the parse tree around as it's being built.
+
+    * This TreeBuilder doesn't use different subclasses of NavigableString
+    based on the name of the tag in which the string was found.
+
+    * You can't use a SoupStrainer to parse only part of a document.
+    """
 
     NAME = "html5lib"
 
@@ -116,6 +127,9 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
                 "", "html.parser", store_line_numbers=store_line_numbers,
                 **kwargs
             )
+        # TODO: What are **kwargs exactly? Should they be passed in
+        # here in addition to/instead of being passed to the BeautifulSoup
+        # constructor?
         super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
 
         # This will be set later to an html5lib.html5parser.HTMLParser