summary refs log tree commit diff
path: root/bs4/builder/__init__.py
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2021-10-11 16:37:41 -0400
committer Leonard Richardson <leonardr@segfault.org> 2021-10-11 16:37:41 -0400
commit 283a27a0af4c70d91695a146b7a6cbc82dc28098 (patch)
tree 97ef8be25ff87e82c446eaa4eb462a82e9988ae9 /bs4/builder/__init__.py
parent 242a340e5cf8c13449c9a4d73cf55194536a27d1 (diff)
Added special string classes, RubyParenthesisString and RubyTextString,
to make it possible to treat ruby text specially in get_text() calls. [bug=1941980]
Diffstat (limited to 'bs4/builder/__init__.py')
-rw-r--r-- bs4/builder/__init__.py 20
1 files changed, 15 insertions, 5 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index bd44905..fa802f4 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -7,6 +7,8 @@ import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
+ RubyParenthesisString,
+ RubyTextString,
Stylesheet,
Script,
TemplateString,
@@ -319,7 +321,7 @@ class TreeBuilder(object):
values = value
attrs[attr] = values
return attrs
-
+
class SAXTreeBuilder(TreeBuilder):
"""A Beautiful Soup treebuilder that listens for SAX events.
@@ -390,17 +392,25 @@ class HTMLTreeBuilder(TreeBuilder):
# you need to use it.
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
- # The HTML standard defines an unusual content model for these tags.
- # We represent this by using a string class other than NavigableString
- # inside these tags.
+ # These HTML tags need special treatment so they can be
+ # represented by a string class other than NavigableString.
#
- # I made this list by going through the HTML spec
+ # For some of these tags, it's because the HTML standard defines
+ # an unusual content model for them. I made this list by going
+ # through the HTML spec
# (https://html.spec.whatwg.org/#metadata-content) and looking for
# "metadata content" elements that can contain strings.
#
+ # The Ruby tags (<rt> and <rp>) are here despite being normal
+ # "phrasing content" tags, because the content they contain is
+ # qualitatively different from other text in the document, and it
+ # can be useful to be able to distinguish it.
+ #
# TODO: Arguably <noscript> could go here but it seems
# qualitatively different from the other tags.
DEFAULT_STRING_CONTAINERS = {
+ 'rt' : RubyTextString,
+ 'rp' : RubyParenthesisString,
'style': Stylesheet,
'script': Script,
'template': TemplateString,