summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2021-10-11 16:37:41 -0400
committer Leonard Richardson <leonardr@segfault.org> 2021-10-11 16:37:41 -0400
commit 283a27a0af4c70d91695a146b7a6cbc82dc28098 (patch)
tree 97ef8be25ff87e82c446eaa4eb462a82e9988ae9
parent 242a340e5cf8c13449c9a4d73cf55194536a27d1 (diff)
Added special string classes, RubyParenthesisString and RubyTextString,
to make it possible to treat ruby text specially in get_text() calls. [bug=1941980]
-rw-r--r-- CHANGELOG 4
-rw-r--r-- bs4/builder/__init__.py 20
-rw-r--r-- bs4/element.py 26
-rw-r--r-- bs4/tests/test_navigablestring.py 14
4 files changed, 59 insertions, 5 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 200c7e5..5e02c4b 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -7,6 +7,10 @@ Python 2 was revision 605.
* Ported unit tests to use pytest.
+* Added special string classes, RubyParenthesisString and RubyTextString,
+ to make it possible to treat ruby text specially in get_text() calls.
+ [bug=1941980]
+
= 4.10.0 (20210907)
* This is the first release of Beautiful Soup to only support Python
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index bd44905..fa802f4 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -7,6 +7,8 @@ import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
+ RubyParenthesisString,
+ RubyTextString,
Stylesheet,
Script,
TemplateString,
@@ -319,7 +321,7 @@ class TreeBuilder(object):
values = value
attrs[attr] = values
return attrs
-
+
class SAXTreeBuilder(TreeBuilder):
"""A Beautiful Soup treebuilder that listens for SAX events.
@@ -390,17 +392,25 @@ class HTMLTreeBuilder(TreeBuilder):
# you need to use it.
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
- # The HTML standard defines an unusual content model for these tags.
- # We represent this by using a string class other than NavigableString
- # inside these tags.
+ # These HTML tags need special treatment so they can be
+ # represented by a string class other than NavigableString.
#
- # I made this list by going through the HTML spec
+ # For some of these tags, it's because the HTML standard defines
+ # an unusual content model for them. I made this list by going
+ # through the HTML spec
# (https://html.spec.whatwg.org/#metadata-content) and looking for
# "metadata content" elements that can contain strings.
#
+ # The Ruby tags (<rt> and <rp>) are here despite being normal
+ # "phrasing content" tags, because the content they contain is
+ # qualitatively different from other text in the document, and it
+ # can be useful to be able to distinguish it.
+ #
# TODO: Arguably <noscript> could go here but it seems
# qualitatively different from the other tags.
DEFAULT_STRING_CONTAINERS = {
+ 'rt' : RubyTextString,
+ 'rp' : RubyParenthesisString,
'style': Stylesheet,
'script': Script,
'template': TemplateString,
diff --git a/bs4/element.py b/bs4/element.py
index 3eed924..57a24d0 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1012,6 +1012,11 @@ class NavigableString(str, PageElement):
# Do nothing if the caller is looking for specific types of
# string, and we're of a different type.
+ #
+ # We check specific types instead of using isinstance(self,
+ # types) because all of these classes subclass
+ # NavigableString. Anyone who's using this feature probably
+ # wants generic NavigableStrings but not other stuff.
my_type = type(self)
if types is not None:
if isinstance(types, type):
@@ -1140,6 +1145,27 @@ class TemplateString(NavigableString):
pass
+class RubyTextString(NavigableString):
+ """A NavigableString representing the contents of the <rt> HTML
+ element.
+
+ https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element
+
+ Can be used to distinguish such strings from the strings they're
+ annotating.
+ """
+ pass
+
+
+class RubyParenthesisString(NavigableString):
+ """A NavigableString representing the contents of the <rp> HTML
+ element.
+
+ https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element
+ """
+ pass
+
+
class Tag(PageElement):
"""Represents an HTML or XML tag that is part of a parse tree, along
with its attributes and contents.
diff --git a/bs4/tests/test_navigablestring.py b/bs4/tests/test_navigablestring.py
index 2b76392..649acc0 100644
--- a/bs4/tests/test_navigablestring.py
+++ b/bs4/tests/test_navigablestring.py
@@ -6,6 +6,8 @@ from bs4.element import (
Declaration,
Doctype,
NavigableString,
+ RubyParenthesisString,
+ RubyTextString,
Script,
Stylesheet,
TemplateString,
@@ -128,3 +130,15 @@ class TestNavigableStringSubclasses(SoupTest):
soup = self.soup(markup)
assert markup == soup.template.encode("utf8")
+ def test_ruby_strings(self):
+ markup = "<ruby>漢 <rp>(</rp><rt>kan</rt><rp>)</rp> 字 <rp>(</rp><rt>ji</rt><rp>)</rp></ruby>"
+ soup = self.soup(markup)
+ assert isinstance(soup.rp.string, RubyParenthesisString)
+ assert isinstance(soup.rt.string, RubyTextString)
+
+ # Just as a demo, here's what this means for get_text usage.
+ assert "漢字" == soup.get_text(strip=True)
+ assert "漢(kan)字(ji)" == soup.get_text(
+ strip=True,
+ types=(NavigableString, RubyTextString, RubyParenthesisString)
+ )