diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-21 17:03:13 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-21 17:03:13 -0500 |
commit | 274d94dc13ffeb80c587f68bbad267f4f5199b9e (patch) | |
tree | 79d0597c5cb1a5a514b25fbd8de7a55ae5123f94 | |
parent | ce3742abd4c7fe39247569e82e2b3acdd6052bb1 (diff) |
Created an EntitySubstitution class that's going to take code away from UnicodeDammit, Entities, and BeautifulSoup.
-rw-r--r-- | beautifulsoup/dammit.py | 123 | ||||
-rw-r--r-- | tests/test_soup.py | 67 |
2 files changed, 189 insertions, 1 deletions
diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py index 455b0bf..06e142e 100644 --- a/beautifulsoup/dammit.py +++ b/beautifulsoup/dammit.py @@ -33,6 +33,129 @@ except ImportError: pass +from htmlentitydefs import codepoint2name +import re + +class EntitySubstitution(object): + CHARACTER_TO_HTML_ENTITY = None + CHARACTER_TO_HTML_ENTITY_RE = None + + CHARACTER_TO_XML_ENTITY = { + "'" : "apos", + '"' : "quot", + "&" : "amp", + "<" : "lt", + ">" : "gt", + } + + BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" + ")") + + @classmethod + def _initialize_lookup(cls): + if cls.CHARACTER_TO_HTML_ENTITY is not None: + return + lookup = {} + characters = [] + for codepoint, name in codepoint2name.items(): + character = unichr(codepoint) + characters.append(character) + lookup[character] = name + re_definition = "[%s]" % "".join(characters) + cls.CHARACTER_TO_HTML_ENTITY = lookup + cls.CHARACTER_TO_HTML_ENTITY_RE = re.compile(re_definition) + + def __init__(self): + # Initialize the class variables if not already initialized + self._initialize_lookup() + + def _substitute_html_entity(self, matchobj): + entity = self.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) + return "&%s;" % entity + + def _substitute_xml_entity(self, matchobj): + """Used with a regular expression to substitute the + appropriate XML entity for an XML special character.""" + entity = self.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] + return "&%s;" % entity + + def substitute_xml(self, value, make_quoted_attribute=False, + destination_is_xml=False): + """Substitute XML entities for special XML characters. + + :param value: A string to be substituted. The less-than sign will + become &lt;, the greater-than sign will become &gt;, and any + ampersands that are not part of an entity definition will + become &amp;. + + :param make_quoted_attribute: If True, then the string will be + quoted, as befits an attribute value. 
+ + Ordinarily, the string will be quoted using double quotes. + + Bob's Bar -> "Bob's Bar" + + If the string contains double quotes, it will be quoted using + single quotes. + + Welcome to "my bar" -> 'Welcome to "my bar"' + + If the string contains both single and double quotes, the + single quotes will be escaped (see `destination_is_xml`), and + the string will be quoted using single quotes. + + Welcome to "Bob's Bar" -> 'Welcome to "Bob&squot;s bar' + OR + 'Welcome to "Bob&apos;s bar' + (depending on the value of `destination_is_xml`) + + :param destination_is_xml: If destination_is_xml is True, + then when a single quote is escaped it will become + "&apos;". But &apos; is not a valid HTML 4 entity. If + destination_is_xml is False, then single quotes will be + turned into "&squot;". + + The value of this argument is irrelevant unless + make_quoted_attribute is True. + """ + quote_with = '"' + if make_quoted_attribute: + if '"' in value: + quote_with = "'" + if "'" in value: + if destination_is_xml: + replace_with = "&apos;" + else: + replace_with = "&squot;" + value = value.replace("'", replace_with) + + # Escape angle brackets, and ampersands that aren't part of + # entities. + value = self.BARE_AMPERSAND_OR_BRACKET.sub( + self._substitute_xml_entity, value) + if make_quoted_attribute: + return quote_with + value + quote_with + else: + return value + + def substitute_html(self, s): + """Replace certain Unicode characters with named HTML entities. + + This differs from data.encode(encoding, 'xmlcharrefreplace') + in that the goal is to make the result more readable (to those + with ASCII displays) rather than to recover from + errors. There's absolutely nothing wrong with a UTF-8 string + containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that + character with "&eacute;" will make it more readable to some + people. 
+ """ + return self.CHARACTER_TO_HTML_ENTITY_RE.sub( + self._substitute_html_entity, s) + + + + class UnicodeDammit: """A class for detecting the encoding of a *ML document and converting it to a Unicode string. If the source encoding is diff --git a/tests/test_soup.py b/tests/test_soup.py index bb2262a..eaedd94 100644 --- a/tests/test_soup.py +++ b/tests/test_soup.py @@ -3,7 +3,7 @@ import unittest from beautifulsoup.element import SoupStrainer -from beautifulsoup.dammit import UnicodeDammit +from beautifulsoup.dammit import EntitySubstitution, UnicodeDammit from beautifulsoup.testing import SoupTest @@ -16,6 +16,71 @@ class TestSelectiveParsing(SoupTest): self.assertEquals(soup.encode(), "<b>Yes</b><b>Yes <c>Yes</c></b>") +class TestEntitySubstitution(unittest.TestCase): + """Standalone tests of the EntitySubstitution class.""" + def setUp(self): + self.sub = EntitySubstitution() + + def test_simple_html_substitution(self): + # Unicode characters corresponding to named HTML entities + # are substituted, and no others. + s = u"foo\u2200\N{SNOWMAN}\u00f5bar" + self.assertEquals(self.sub.substitute_html(s), + u"foo&forall;\N{SNOWMAN}&otilde;bar") + + def test_smart_quote_substitution(self): + # MS smart quotes are a common source of frustration, so we + # give them a special test. 
+ quotes = "\x91\x92foo\x93\x94" + dammit = UnicodeDammit(quotes) + self.assertEquals(self.sub.substitute_html(dammit.markup), + "&lsquo;&rsquo;foo&ldquo;&rdquo;") + + def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self): + s = 'Welcome to "my bar"' + self.assertEquals(self.sub.substitute_xml(s, False), s) + + def test_xml_attribute_quoting_normally_uses_double_quotes(self): + self.assertEquals(self.sub.substitute_xml("Welcome", True), + '"Welcome"') + self.assertEquals(self.sub.substitute_xml("Bob's Bar", True), + '"Bob\'s Bar"') + + def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self): + s = 'Welcome to "my bar"' + self.assertEquals(self.sub.substitute_xml(s, True), + "'Welcome to \"my bar\"'") + + def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self): + s = 'Welcome to "Bob\'s Bar"' + # This one is going into an HTML document. + self.assertEquals( + self.sub.substitute_xml(s, True), + "'Welcome to \"Bob&squot;s Bar\"'") + + # This one is going into an XML document. + self.assertEquals( + self.sub.substitute_xml(s, True, destination_is_xml=True), + "'Welcome to \"Bob&apos;s Bar\"'") + + def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self): + quoted = 'Welcome to "Bob\'s Bar"' + self.assertEquals(self.sub.substitute_xml(quoted), quoted) + + def test_xml_quoting_handles_angle_brackets(self): + self.assertEquals( + self.sub.substitute_xml("foo<bar>"), + "foo&lt;bar&gt;") + + def test_xml_quoting_handles_ampersands(self): + self.assertEquals(self.sub.substitute_xml("AT&T"), "AT&amp;T") + + def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self): + self.assertEquals( + self.sub.substitute_xml("&Aacute;T&T"), + "&Aacute;T&amp;T") + + class TestUnicodeDammit(unittest.TestCase): """Standalone tests of Unicode, Dammit.""" |