diff options
-rw-r--r-- | beautifulsoup/dammit.py | 25 | ||||
-rw-r--r-- | beautifulsoup/element.py | 53 |
2 files changed, 23 insertions, 55 deletions
diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py index 06e142e..f0690c1 100644 --- a/beautifulsoup/dammit.py +++ b/beautifulsoup/dammit.py @@ -37,8 +37,20 @@ from htmlentitydefs import codepoint2name import re class EntitySubstitution(object): - CHARACTER_TO_HTML_ENTITY = None - CHARACTER_TO_HTML_ENTITY_RE = None + + def _populate_class_variables(): + lookup = {} + characters = [] + for codepoint, name in codepoint2name.items(): + character = unichr(codepoint) + characters.append(character) + lookup[character] = name + re_definition = "[%s]" % "".join(characters) + return lookup, re.compile(re_definition) + + CHARACTER_TO_HTML_ENTITY, CHARACTER_TO_HTML_ENTITY_RE = ( + _populate_class_variables()) + CHARACTER_TO_XML_ENTITY = { "'" : "apos", @@ -56,15 +68,6 @@ class EntitySubstitution(object): def _initialize_lookup(cls): if cls.CHARACTER_TO_HTML_ENTITY is not None: return - lookup = {} - characters = [] - for codepoint, name in codepoint2name.items(): - character = unichr(codepoint) - characters.append(character) - lookup[character] = name - re_definition = "[%s]" % "".join(characters) - cls.CHARACTER_TO_HTML_ENTITY = lookup - cls.CHARACTER_TO_HTML_ENTITY_RE = re.compile(re_definition) def __init__(self): # Initialize the class variables if not already initialized diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index a70813d..6101641 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -4,6 +4,7 @@ try: from htmlentitydefs import name2codepoint except ImportError: name2codepoint = {} +from beautifulsoup.dammit import EntitySubstitution from util import isString, isList @@ -417,7 +418,7 @@ class Doctype(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): return u'<!DOCTYPE ' + self + u'>' -class Tag(PageElement, Entities): +class Tag(PageElement, EntitySubstitution): """Represents a found HTML tag with its attributes and contents.""" @@ -574,15 +575,6 @@ class Tag(PageElement, Entities): """Renders this tag as a string.""" return self.decode(eventualEncoding=encoding) - BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" - + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" - + ")") - - def _sub_entity(self, x): - """Used with a regular expression to substitute the - appropriate XML entity for an XML special character.""" - return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" - def __unicode__(self): return self.decode() @@ -601,45 +593,18 @@ class Tag(PageElement, Entities): attrs = [] if self.attrs: for key, val in self.attrs: - fmt = '%s="%s"' - if isString(val): + if val is None: + decoded = key + else: + if not isString(val): + val = str(val) if (self.contains_substitutions and eventualEncoding is not None and '%SOUP-ENCODING%' in val): val = self.substituteEncoding(val, eventualEncoding) - # The attribute value either: - # - # * Contains no embedded double quotes or single quotes. - # No problem: we enclose it in double quotes. - # * Contains embedded single quotes. No problem: - # double quotes work here too. - # * Contains embedded double quotes. No problem: - # we enclose it in single quotes. - # * Embeds both single _and_ double quotes. This - # can't happen naturally, but it can happen if - # you modify an attribute value after parsing - # the document. Now we have a bit of a - # problem. We solve it by enclosing the - # attribute in single quotes, and escaping any - # embedded single quotes to XML entities. - if '"' in val: - fmt = "%s='%s'" - if "'" in val: - # TODO: replace with apos when - # appropriate. - val = val.replace("'", "&squot;") - - # Now we're okay w/r/t quotes. But the attribute - # value might also contain angle brackets, or - # ampersands that aren't part of entities. We need - # to escape those to XML entities too. - val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) - if val is None: - # Handle boolean attributes. - decoded = key - else: - decoded = fmt % (key, val) + # Set destination_is_xml based on something... + decoded = key + '=' + self.substitute_xml(val, True, False) attrs.append(decoded) close = '' closeTag = '' |