diff options
author | Leonard Richardson <leonardr@segfault.org> | 2021-10-11 16:37:41 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2021-10-11 16:37:41 -0400 |
commit | 283a27a0af4c70d91695a146b7a6cbc82dc28098 (patch) | |
tree | 97ef8be25ff87e82c446eaa4eb462a82e9988ae9 /bs4/builder/__init__.py | |
parent | 242a340e5cf8c13449c9a4d73cf55194536a27d1 (diff) |
Added special string classes, RubyParenthesisString and RubyTextString,
to make it possible to treat ruby text specially in get_text() calls.
[bug=1941980]
Diffstat (limited to 'bs4/builder/__init__.py')
-rw-r--r-- | bs4/builder/__init__.py | 20 |
1 files changed, 15 insertions, 5 deletions
```diff
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index bd44905..fa802f4 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -7,6 +7,8 @@ import sys
 from bs4.element import (
     CharsetMetaAttributeValue,
     ContentMetaAttributeValue,
+    RubyParenthesisString,
+    RubyTextString,
     Stylesheet,
     Script,
     TemplateString,
@@ -319,7 +321,7 @@ class TreeBuilder(object):
                         values = value
                     attrs[attr] = values
         return attrs
-    
+
 class SAXTreeBuilder(TreeBuilder):
     """A Beautiful Soup treebuilder that listens for SAX events.
 
@@ -390,17 +392,25 @@ class HTMLTreeBuilder(TreeBuilder):
     # you need to use it.
     block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
 
-    # The HTML standard defines an unusual content model for these tags.
-    # We represent this by using a string class other than NavigableString
-    # inside these tags.
+    # These HTML tags need special treatment so they can be
+    # represented by a string class other than NavigableString.
     #
-    # I made this list by going through the HTML spec
+    # For some of these tags, it's because the HTML standard defines
+    # an unusual content model for them. I made this list by going
+    # through the HTML spec
     # (https://html.spec.whatwg.org/#metadata-content) and looking for
     # "metadata content" elements that can contain strings.
     #
+    # The Ruby tags (<rt> and <rp>) are here despite being normal
+    # "phrasing content" tags, because the content they contain is
+    # qualitatively different from other text in the document, and it
+    # can be useful to be able to distinguish it.
+    #
     # TODO: Arguably <noscript> could go here but it seems
     # qualitatively different from the other tags.
     DEFAULT_STRING_CONTAINERS = {
+        'rt' : RubyTextString,
+        'rp' : RubyParenthesisString,
         'style': Stylesheet,
         'script': Script,
         'template': TemplateString,
```