diff options
Diffstat (limited to 'beautifulsoup/dammit.py')
-rw-r--r-- | beautifulsoup/dammit.py | 119 |
1 files changed, 113 insertions, 6 deletions
import re


class EntitySubstitution(object):
    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        """Build the HTML entity lookup table and its matching regex.

        Called once while the class body executes; it takes no ``self``.

        :return: A 2-tuple ``(lookup, regex)``. ``lookup`` maps each
         character that has a named HTML entity to the entity's name,
         and ``regex`` matches any single one of those characters.
        """
        # The entity table lives in a different module on Python 2 and
        # Python 3; try the old location first, then the new one.
        try:
            from htmlentitydefs import codepoint2name
        except ImportError:
            from html.entities import codepoint2name
        try:
            to_character = unichr
        except NameError:
            # Python 3: chr() already returns a Unicode character.
            to_character = chr

        lookup = {}
        for codepoint, name in codepoint2name.items():
            lookup[to_character(codepoint)] = name
        # Escape each character so that none of them can be read as
        # character-class syntax.
        re_definition = "[%s]" % "".join(
            re.escape(character) for character in lookup)
        return lookup, re.compile(re_definition)
    CHARACTER_TO_HTML_ENTITY, CHARACTER_TO_HTML_ENTITY_RE = (
        _populate_class_variables())

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches an angle bracket, or an ampersand that does not begin a
    # numeric (&#38;), hexadecimal (&#x26;) or named (&amp;) entity.
    BARE_AMPERSAND_OR_BRACKET = re.compile(
        r"([<>]|"
        r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
        r")")

    def _substitute_html_entity(self, matchobj):
        """Used with a regular expression to substitute the
        appropriate named HTML entity for a matched character."""
        entity = self.CHARACTER_TO_HTML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    def _substitute_xml_entity(self, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = self.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    def substitute_xml(self, value, make_quoted_attribute=False,
                       destination_is_xml=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

         Ordinarily, the string will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If the string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If the string contains both single and double quotes, the
         single quotes will be escaped (see `destination_is_xml`), and
         the string will be quoted using single quotes.

          Welcome to "Bob's Bar" -> 'Welcome to "Bob&squot;s Bar"'
           OR
                                    'Welcome to "Bob&apos;s Bar"'
           (depending on the value of `destination_is_xml`)

        :param destination_is_xml: If destination_is_xml is True,
         then when a single quote is escaped it will become
         "&apos;". But &apos; is not a valid HTML 4 entity. If
         destination_is_xml is False, then single quotes will be
         turned into "&squot;".

         The value of this argument is irrelevant unless
         make_quoted_attribute is True.

        :return: The substituted string, wrapped in quote characters
         if make_quoted_attribute is True.
        """
        quote_with = '"'
        if make_quoted_attribute and '"' in value:
            # Double quotes inside the value force single-quote
            # delimiters, so any single quotes must be escaped.
            quote_with = "'"
            if "'" in value:
                if destination_is_xml:
                    replace_with = "&apos;"
                else:
                    replace_with = "&squot;"
                value = value.replace("'", replace_with)

        # Escape angle brackets, and ampersands that aren't part of
        # entities. The entities inserted above are left alone because
        # they already look like entity references.
        value = self.BARE_AMPERSAND_OR_BRACKET.sub(
            self._substitute_xml_entity, value)
        if make_quoted_attribute:
            return quote_with + value + quote_with
        return value

    def substitute_html(self, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: The string to be substituted.
        :return: The string with every character that has a named HTML
         entity replaced by that entity.
        """
        return self.CHARACTER_TO_HTML_ENTITY_RE.sub(
            self._substitute_html_entity, s)