diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-21 17:11:56 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-21 17:11:56 -0500 |
commit | 384c5cd07b027b1a53d59d81fd3dc7661cbc2ab1 (patch) | |
tree | 3de877db036706e3f11981a4d33968dea4a020e1 /beautifulsoup/element.py | |
parent | 274d94dc13ffeb80c587f68bbad267f4f5199b9e (diff) |
Switched Tag.decode to use EntitySubstitution.substitute_xml.
Diffstat (limited to 'beautifulsoup/element.py')
-rw-r--r-- | beautifulsoup/element.py | 53 |
1 files changed, 9 insertions, 44 deletions
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index a70813d..6101641 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -4,6 +4,7 @@ try: from htmlentitydefs import name2codepoint except ImportError: name2codepoint = {} +from beautifulsoup.dammit import EntitySubstitution from util import isString, isList @@ -417,7 +418,7 @@ class Doctype(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): return u'<!DOCTYPE ' + self + u'>' -class Tag(PageElement, Entities): +class Tag(PageElement, EntitySubstitution): """Represents a found HTML tag with its attributes and contents.""" @@ -574,15 +575,6 @@ class Tag(PageElement, Entities): """Renders this tag as a string.""" return self.decode(eventualEncoding=encoding) - BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" - + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" - + ")") - - def _sub_entity(self, x): - """Used with a regular expression to substitute the - appropriate XML entity for an XML special character.""" - return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" - def __unicode__(self): return self.decode() @@ -601,45 +593,18 @@ class Tag(PageElement, Entities): attrs = [] if self.attrs: for key, val in self.attrs: - fmt = '%s="%s"' - if isString(val): + if val is None: + decoded = key + else: + if not isString(val): + val = str(val) if (self.contains_substitutions and eventualEncoding is not None and '%SOUP-ENCODING%' in val): val = self.substituteEncoding(val, eventualEncoding) - # The attribute value either: - # - # * Contains no embedded double quotes or single quotes. - # No problem: we enclose it in double quotes. - # * Contains embedded single quotes. No problem: - # double quotes work here too. - # * Contains embedded double quotes. No problem: - # we enclose it in single quotes. - # * Embeds both single _and_ double quotes. This - # can't happen naturally, but it can happen if - # you modify an attribute value after parsing - # the document. Now we have a bit of a - # problem. We solve it by enclosing the - # attribute in single quotes, and escaping any - # embedded single quotes to XML entities. - if '"' in val: - fmt = "%s='%s'" - if "'" in val: - # TODO: replace with apos when - # appropriate. - val = val.replace("'", "&squot;") - - # Now we're okay w/r/t quotes. But the attribute - # value might also contain angle brackets, or - # ampersands that aren't part of entities. We need - # to escape those to XML entities too. - val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) - if val is None: - # Handle boolean attributes. - decoded = key - else: - decoded = fmt % (key, val) + # Set destination_is_xml based on something... + decoded = key + '=' + self.substitute_xml(val, True, False) attrs.append(decoded) close = '' closeTag = '' |