Added special string classes, RubyParenthesisString and RubyTextString,
to make it possible to treat ruby text specially in get_text() calls. [bug=1941980]

author: Leonard Richardson <leonardr@segfault.org> 2021-10-11 16:37:41 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2021-10-11 16:37:41 -0400
commit: 283a27a0af4c70d91695a146b7a6cbc82dc28098 (patch)
tree: 97ef8be25ff87e82c446eaa4eb462a82e9988ae9 /bs4/builder/__init__.py
parent: 242a340e5cf8c13449c9a4d73cf55194536a27d1 (diff)
1 file changed, 15 insertions, 5 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index bd44905..fa802f4 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -7,6 +7,8 @@ import sys
 from bs4.element import (
     CharsetMetaAttributeValue,
     ContentMetaAttributeValue,
+    RubyParenthesisString,
+    RubyTextString,
     Stylesheet,
     Script,
     TemplateString,
@@ -319,7 +321,7 @@ class TreeBuilder(object):
                         values = value
                     attrs[attr] = values
         return attrs
-
+    
 class SAXTreeBuilder(TreeBuilder):
     """A Beautiful Soup treebuilder that listens for SAX events.
 
@@ -390,17 +392,25 @@ class HTMLTreeBuilder(TreeBuilder):
     # you need to use it.
     block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
 
-    # The HTML standard defines an unusual content model for these tags.
-    # We represent this by using a string class other than NavigableString
-    # inside these tags.
+    # These HTML tags need special treatment so they can be
+    # represented by a string class other than NavigableString.
     #
-    # I made this list by going through the HTML spec
+    # For some of these tags, it's because the HTML standard defines
+    # an unusual content model for them. I made this list by going
+    # through the HTML spec
     # (https://html.spec.whatwg.org/#metadata-content) and looking for
     # "metadata content" elements that can contain strings.
     #
+    # The Ruby tags (<rt> and <rp>) are here despite being normal
+    # "phrasing content" tags, because the content they contain is
+    # qualitatively different from other text in the document, and it
+    # can be useful to be able to distinguish it.
+    #
     # TODO: Arguably <noscript> could go here but it seems
     # qualitatively different from the other tags.
     DEFAULT_STRING_CONTAINERS = {
+        'rt' : RubyTextString,
+        'rp' : RubyParenthesisString,
         'style': Stylesheet,
         'script': Script,
         'template': TemplateString,
author	Leonard Richardson <leonardr@segfault.org>	2021-10-11 16:37:41 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2021-10-11 16:37:41 -0400
commit	283a27a0af4c70d91695a146b7a6cbc82dc28098 (patch)
tree	97ef8be25ff87e82c446eaa4eb462a82e9988ae9 /bs4/builder/__init__.py
parent	242a340e5cf8c13449c9a4d73cf55194536a27d1 (diff)