diff options
| Field | Value | Date |
|---|---|---|
| author | Leonard Richardson <leonardr@segfault.org> | 2021-10-11 16:37:41 -0400 |
| committer | Leonard Richardson <leonardr@segfault.org> | 2021-10-11 16:37:41 -0400 |
| commit | 283a27a0af4c70d91695a146b7a6cbc82dc28098 (patch) | |
| tree | 97ef8be25ff87e82c446eaa4eb462a82e9988ae9 | |
| parent | 242a340e5cf8c13449c9a4d73cf55194536a27d1 (diff) | |

Added special string classes, RubyParenthesisString and RubyTextString,
to make it possible to treat ruby text specially in get_text() calls.
[bug=1941980]

| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | CHANGELOG | 4 |
| -rw-r--r-- | bs4/builder/__init__.py | 20 |
| -rw-r--r-- | bs4/element.py | 26 |
| -rw-r--r-- | bs4/tests/test_navigablestring.py | 14 |

4 files changed, 59 insertions, 5 deletions

@@ -7,6 +7,10 @@ Python 2 was revision 605. * Ported unit tests to use pytest. +* Added special string classes, RubyParenthesisString and RubyTextString, + to make it possible to treat ruby text specially in get_text() calls. + [bug=1941980] + = 4.10.0 (20210907) * This is the first release of Beautiful Soup to only support Python diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index bd44905..fa802f4 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -7,6 +7,8 @@ import sys from bs4.element import ( CharsetMetaAttributeValue, ContentMetaAttributeValue, + RubyParenthesisString, + RubyTextString, Stylesheet, Script, TemplateString, @@ -319,7 +321,7 @@ class TreeBuilder(object): values = value attrs[attr] = values return attrs - + class SAXTreeBuilder(TreeBuilder): """A Beautiful Soup treebuilder that listens for SAX events. @@ -390,17 +392,25 @@ class HTMLTreeBuilder(TreeBuilder): # you need to use it. block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"]) - # The HTML standard defines an unusual content model for these tags. - # We represent this by using a string class other than NavigableString - # inside these tags. + # These HTML tags need special treatment so they can be + # represented by a string class other than NavigableString. # - # I made this list by going through the HTML spec + # For some of these tags, it's because the HTML standard defines + # an unusual content model for them. I made this list by going + # through the HTML spec # (https://html.spec.whatwg.org/#metadata-content) and looking for # "metadata content" elements that can contain strings. 
# + # The Ruby tags (<rt> and <rp>) are here despite being normal + # "phrasing content" tags, because the content they contain is + # qualitatively different from other text in the document, and it + # can be useful to be able to distinguish it. + # # TODO: Arguably <noscript> could go here but it seems # qualitatively different from the other tags. DEFAULT_STRING_CONTAINERS = { + 'rt' : RubyTextString, + 'rp' : RubyParenthesisString, 'style': Stylesheet, 'script': Script, 'template': TemplateString, diff --git a/bs4/element.py b/bs4/element.py index 3eed924..57a24d0 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1012,6 +1012,11 @@ class NavigableString(str, PageElement): # Do nothing if the caller is looking for specific types of # string, and we're of a different type. + # + # We check specific types instead of using isinstance(self, + # types) because all of these classes subclass + # NavigableString. Anyone who's using this feature probably + # wants generic NavigableStrings but not other stuff. my_type = type(self) if types is not None: if isinstance(types, type): @@ -1140,6 +1145,27 @@ class TemplateString(NavigableString): pass +class RubyTextString(NavigableString): + """A NavigableString representing the contents of the <rt> HTML + element. + + https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element + + Can be used to distinguish such strings from the strings they're + annotating. + """ + pass + + +class RubyParenthesisString(NavigableString): + """A NavigableString representing the contents of the <rp> HTML + element. + + https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element + """ + pass + + class Tag(PageElement): """Represents an HTML or XML tag that is part of a parse tree, along with its attributes and contents. 
diff --git a/bs4/tests/test_navigablestring.py b/bs4/tests/test_navigablestring.py index 2b76392..649acc0 100644 --- a/bs4/tests/test_navigablestring.py +++ b/bs4/tests/test_navigablestring.py @@ -6,6 +6,8 @@ from bs4.element import ( Declaration, Doctype, NavigableString, + RubyParenthesisString, + RubyTextString, Script, Stylesheet, TemplateString, @@ -128,3 +130,15 @@ class TestNavigableStringSubclasses(SoupTest): soup = self.soup(markup) assert markup == soup.template.encode("utf8") + def test_ruby_strings(self): + markup = "<ruby>漢 <rp>(</rp><rt>kan</rt><rp>)</rp> 字 <rp>(</rp><rt>ji</rt><rp>)</rp></ruby>" + soup = self.soup(markup) + assert isinstance(soup.rp.string, RubyParenthesisString) + assert isinstance(soup.rt.string, RubyTextString) + + # Just as a demo, here's what this means for get_text usage. + assert "漢字" == soup.get_text(strip=True) + assert "漢(kan)字(ji)" == soup.get_text( + strip=True, + types=(NavigableString, RubyTextString, RubyParenthesisString) + ) |