diff options
Diffstat (limited to 'beautifulsoup/element.py')
-rw-r--r-- | beautifulsoup/element.py | 165 |
1 files changed, 81 insertions, 84 deletions
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index 23f8c33..6af27a8 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -11,7 +11,7 @@ from util import isList DEFAULT_OUTPUT_ENCODING = "utf-8" -class PageElement(object): +class PageElement(EntitySubstitution): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" @@ -334,6 +334,9 @@ class PageElement(object): class NavigableString(unicode, PageElement): + PREFIX = '' + SUFFIX = '' + def __new__(cls, value): """Create a new NavigableString. @@ -358,29 +361,35 @@ class NavigableString(unicode, PageElement): else: raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) - def decodeGivenEventualEncoding(self, eventual_encoding): - return self + def output_ready(self, substitute_html_entities=False): + if substitute_html_entities: + output = self.substitute_html(self) + else: + output = self + return self.PREFIX + output + self.SUFFIX + class CData(NavigableString): - def decodeGivenEventualEncoding(self, eventual_encoding): - return u'<![CDATA[' + self + u']]>' + PREFIX = u'<![CDATA[' + SUFFIX = u']]>' + class ProcessingInstruction(NavigableString): - def decodeGivenEventualEncoding(self, eventual_encoding): - output = self - if u'%SOUP-ENCODING%' in output: - output = self.substituteEncoding(output, eventual_encoding) - return u'<?' + output + u'?>' + PREFIX = u'<?' + SUFFIX = u'?>' + class Comment(NavigableString): - def decodeGivenEventualEncoding(self, eventual_encoding): - return u'<!--' + self + u'-->' + + PREFIX = u'<!--' + SUFFIX = u'-->' class Declaration(NavigableString): - def decodeGivenEventualEncoding(self, eventual_encoding): - return u'<!' + self + u'>' + PREFIX = u'<!' + SUFFIX = u'!>' + class Doctype(NavigableString): @@ -394,10 +403,11 @@ class Doctype(NavigableString): return Doctype(value) - def decodeGivenEventualEncoding(self, eventual_encoding): - return u'<!DOCTYPE ' + self + u'>' + PREFIX = u'<!DOCTYPE ' + SUFFIX = u'>' + -class Tag(PageElement, EntitySubstitution): +class Tag(PageElement): """Represents a found HTML tag with its attributes and contents.""" @@ -410,19 +420,14 @@ class Tag(PageElement, EntitySubstitution): self.parserClass = parser.__class__ self.name = name if attrs == None: - attrs = [] - if isinstance(attrs, types.DictType): - self.attrMap = attrs + attrs = {} + else: + attrs = dict(attrs) self.attrs = attrs self.contents = [] self.setup(parent, previous) self.hidden = False - if isinstance(attrs, types.DictType): - self.attrs = [kv for kv in attrs.items()] - else: - self.attrs = list(attrs) - # Set up any substitutions, such as the charset in a META tag. self.contains_substitutions = builder.set_up_substitutions(self) @@ -468,15 +473,15 @@ class Tag(PageElement, EntitySubstitution): """Returns the value of the 'key' attribute for the tag, or the value given for 'default' if it doesn't have that attribute.""" - return self._getAttrMap().get(key, default) + return self.attrs.get(key, default) def has_key(self, key): - return self._getAttrMap().has_key(key) + return self.attrs.has_key(key) def __getitem__(self, key): """tag[key] returns the value of the 'key' attribute for the tag, and throws an exception if it's not there.""" - return self._getAttrMap()[key] + return self.attrs[key] def __iter__(self): "Iterating over a tag iterates over its contents." @@ -496,27 +501,12 @@ class Tag(PageElement, EntitySubstitution): def __setitem__(self, key, value): """Setting tag[key] sets the value of the 'key' attribute for the tag.""" - self._getAttrMap() - self.attrMap[key] = value - found = False - for i in range(0, len(self.attrs)): - if self.attrs[i][0] == key: - self.attrs[i] = (key, value) - found = True - if not found: - self.attrs.append((key, value)) - self._getAttrMap()[key] = value + self.attrs[key] = value def __delitem__(self, key): "Deleting tag[key] deletes all 'key' attributes for the tag." - for item in self.attrs: - if item[0] == key: - self.attrs.remove(item) - #We don't break because bad HTML can define the same - #attribute multiple times. - self._getAttrMap() - if self.attrMap.has_key(key): - del self.attrMap[key] + if self.attrs.has_key(key): + del self.attrs[key] def __call__(self, *args, **kwargs): """Calling a tag like a function is the same as calling its @@ -552,7 +542,7 @@ class Tag(PageElement, EntitySubstitution): def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): """Renders this tag as a string.""" - return self.decode(eventual_encoding=encoding) + return self.encode(encoding) def __unicode__(self): return self.decode() @@ -561,17 +551,25 @@ class Tag(PageElement, EntitySubstitution): return self.encode() def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, - pretty_print=False, indent_level=0): - return self.decode(pretty_print, indent_level, encoding).encode(encoding) - - def decode(self, pretty_print=False, indent_level=0, - eventual_encoding=DEFAULT_OUTPUT_ENCODING): - """Returns a string or Unicode representation of this tag and - its contents. To get Unicode, pass None for encoding.""" - + indent_level=None, substitute_html_entities=False): + return self.decode(indent_level, encoding, + substitute_html_entities).encode(encoding) + + def decode(self, indent_level=None, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + substitute_html_entities=False): + """Returns a Unicode representation of this tag and its contents. + + :param eventual_encoding: The tag is destined to be + encoded into this encoding. This method is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a <META> tag that mentions the document's + encoding. + """ attrs = [] if self.attrs: - for key, val in self.attrs: + for key, val in sorted(self.attrs.items()): if val is None: decoded = key else: @@ -591,14 +589,18 @@ class Tag(PageElement, EntitySubstitution): else: closeTag = '</%s>' % self.name - indentTag, indentContents = 0, 0 + pretty_print = (indent_level is not None) if pretty_print: - indentTag = indent_level - space = (' ' * (indentTag-1)) - indentContents = indentTag + 1 - contents = self.decodeContents(pretty_print, indentContents, - eventual_encoding) + space = (' ' * (indent_level-1)) + indent_contents = indent_level + 1 + else: + space = '' + indent_contents = None + contents = self.decode_contents( + indent_contents, eventual_encoding, substitute_html_entities) + if self.hidden: + # This is the 'document root' object. s = contents else: s = [] @@ -634,22 +636,28 @@ class Tag(PageElement, EntitySubstitution): def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): return self.encode(encoding, True) - def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING, - pretty_print=False, indent_level=0): - return self.decodeContents(pretty_print, indent_level).encode(encoding) - - def decodeContents(self, pretty_print=False, indent_level=0, - eventual_encoding=DEFAULT_OUTPUT_ENCODING): - """Renders the contents of this tag as a string in the given - encoding. If encoding is None, returns a Unicode string..""" + def decode_contents(self, indent_level=None, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + substitute_html_entities=False): + """Renders the contents of this tag as a Unicode string. + + :param eventual_encoding: The tag is destined to be + encoded into this encoding. This method is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a <META> tag that mentions the document's + encoding. + """ + pretty_print = (indent_level is not None) s=[] for c in self: text = None if isinstance(c, NavigableString): - text = c.decodeGivenEventualEncoding(eventual_encoding) + text = c.output_ready(substitute_html_entities) elif isinstance(c, Tag): - s.append(c.decode(pretty_print, indent_level, eventual_encoding)) - if text and pretty_print: + s.append(c.decode(indent_level, eventual_encoding, + substitute_html_entities)) + if text and indent_level: text = text.strip() if text: if pretty_print: @@ -690,17 +698,6 @@ class Tag(PageElement, EntitySubstitution): findAll = find_all # BS3 findChildren = find_all # BS2 - #Private methods - - def _getAttrMap(self): - """Initializes a map representation of this tag's attributes, - if not already initialized.""" - if not getattr(self, 'attrMap'): - self.attrMap = {} - for (key, value) in self.attrs: - self.attrMap[key] = value - return self.attrMap - #Generator methods @property def children(self): |