Added special string classes, RubyParenthesisString and RubyTextString,
to make it possible to treat ruby text specially in get_text() calls. [bug=1941980]
author: Leonard Richardson <leonardr@segfault.org> 2021-10-11 16:37:41 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2021-10-11 16:37:41 -0400
commit: 283a27a0af4c70d91695a146b7a6cbc82dc28098 (patch)
tree: 97ef8be25ff87e82c446eaa4eb462a82e9988ae9
parent: 242a340e5cf8c13449c9a4d73cf55194536a27d1 (diff)
4 files changed, 59 insertions, 5 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 200c7e5..5e02c4b 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -7,6 +7,10 @@ Python 2 was revision 605.
 
 * Ported unit tests to use pytest.
 
+* Added special string classes, RubyParenthesisString and RubyTextString,
+  to make it possible to treat ruby text specially in get_text() calls.
+  [bug=1941980]
+
 = 4.10.0 (20210907)
 
 * This is the first release of Beautiful Soup to only support Python
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index bd44905..fa802f4 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -7,6 +7,8 @@ import sys
 from bs4.element import (
     CharsetMetaAttributeValue,
     ContentMetaAttributeValue,
+    RubyParenthesisString,
+    RubyTextString,
     Stylesheet,
     Script,
     TemplateString,
@@ -319,7 +321,7 @@ class TreeBuilder(object):
                         values = value
                     attrs[attr] = values
         return attrs
-
+    
 class SAXTreeBuilder(TreeBuilder):
     """A Beautiful Soup treebuilder that listens for SAX events.
 
@@ -390,17 +392,25 @@ class HTMLTreeBuilder(TreeBuilder):
     # you need to use it.
     block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
 
-    # The HTML standard defines an unusual content model for these tags.
-    # We represent this by using a string class other than NavigableString
-    # inside these tags.
+    # These HTML tags need special treatment so they can be
+    # represented by a string class other than NavigableString.
     #
-    # I made this list by going through the HTML spec
+    # For some of these tags, it's because the HTML standard defines
+    # an unusual content model for them. I made this list by going
+    # through the HTML spec
     # (https://html.spec.whatwg.org/#metadata-content) and looking for
     # "metadata content" elements that can contain strings.
     #
+    # The Ruby tags (<rt> and <rp>) are here despite being normal
+    # "phrasing content" tags, because the content they contain is
+    # qualitatively different from other text in the document, and it
+    # can be useful to be able to distinguish it.
+    #
     # TODO: Arguably <noscript> could go here but it seems
     # qualitatively different from the other tags.
     DEFAULT_STRING_CONTAINERS = {
+        'rt' : RubyTextString,
+        'rp' : RubyParenthesisString,
         'style': Stylesheet,
         'script': Script,
         'template': TemplateString,
diff --git a/bs4/element.py b/bs4/element.py
index 3eed924..57a24d0 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1012,6 +1012,11 @@ class NavigableString(str, PageElement):
 
         # Do nothing if the caller is looking for specific types of
         # string, and we're of a different type.
+        #
+        # We check specific types instead of using isinstance(self,
+        # types) because all of these classes subclass
+        # NavigableString. Anyone who's using this feature probably
+        # wants generic NavigableStrings but not other stuff.
         my_type = type(self)
         if types is not None:
             if isinstance(types, type):
@@ -1140,6 +1145,27 @@ class TemplateString(NavigableString):
     pass
 
 
+class RubyTextString(NavigableString):
+    """A NavigableString representing the contents of the <rt> HTML
+    element.
+
+    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element
+
+    Can be used to distinguish such strings from the strings they're
+    annotating.
+    """
+    pass
+
+
+class RubyParenthesisString(NavigableString):
+    """A NavigableString representing the contents of the <rp> HTML
+    element.
+
+    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element
+    """
+    pass
+
+
 class Tag(PageElement):
     """Represents an HTML or XML tag that is part of a parse tree, along
     with its attributes and contents.
diff --git a/bs4/tests/test_navigablestring.py b/bs4/tests/test_navigablestring.py
index 2b76392..649acc0 100644
--- a/bs4/tests/test_navigablestring.py
+++ b/bs4/tests/test_navigablestring.py
@@ -6,6 +6,8 @@ from bs4.element import (
     Declaration,
     Doctype,
     NavigableString,
+    RubyParenthesisString,
+    RubyTextString,
     Script,
     Stylesheet,
     TemplateString,
@@ -128,3 +130,15 @@ class TestNavigableStringSubclasses(SoupTest):
         soup = self.soup(markup)
         assert markup == soup.template.encode("utf8")
 
+    def test_ruby_strings(self):
+        markup = "<ruby>漢 <rp>(</rp><rt>kan</rt><rp>)</rp> 字 <rp>(</rp><rt>ji</rt><rp>)</rp></ruby>"
+        soup = self.soup(markup)
+        assert isinstance(soup.rp.string, RubyParenthesisString)
+        assert isinstance(soup.rt.string, RubyTextString)
+
+        # Just as a demo, here's what this means for get_text usage.
+        assert "漢字" == soup.get_text(strip=True)
+        assert "漢(kan)字(ji)" == soup.get_text(
+            strip=True,
+            types=(NavigableString, RubyTextString, RubyParenthesisString)
+        )
author	Leonard Richardson <leonardr@segfault.org>	2021-10-11 16:37:41 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2021-10-11 16:37:41 -0400
commit	283a27a0af4c70d91695a146b7a6cbc82dc28098 (patch)
tree	97ef8be25ff87e82c446eaa4eb462a82e9988ae9
parent	242a340e5cf8c13449c9a4d73cf55194536a27d1 (diff)