diff options
-rw-r--r-- | beautifulsoup/__init__.py | 12 | ||||
-rw-r--r-- | beautifulsoup/dammit.py | 4 | ||||
-rw-r--r-- | beautifulsoup/element.py | 165 | ||||
-rw-r--r-- | tests/test_tree.py | 21 |
4 files changed, 113 insertions, 89 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py index cee55e7..53130e0 100644 --- a/beautifulsoup/__init__.py +++ b/beautifulsoup/__init__.py @@ -262,8 +262,9 @@ class BeautifulSoup(Tag): def handle_data(self, data): self.currentData.append(data) - def decode(self, pretty_print=False, indent_level=0, - eventual_encoding=DEFAULT_OUTPUT_ENCODING): + def decode(self, pretty_print=False, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + substitute_html_entities=False): """Returns a string or Unicode representation of this document. To get Unicode, pass None for encoding.""" if self.is_xml: @@ -274,8 +275,13 @@ class BeautifulSoup(Tag): prefix = u'<?xml version="1.0"%s>\n' % encoding_part else: prefix = u'' + if not pretty_print: + indent_level = None + else: + indent_level = 0 return prefix + super(BeautifulSoup, self).decode( - pretty_print, indent_level, eventual_encoding) + indent_level, eventual_encoding, + substitute_html_entities) class StopParsing(Exception): diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py index 9833bd4..31dfa95 100644 --- a/beautifulsoup/dammit.py +++ b/beautifulsoup/dammit.py @@ -37,8 +37,8 @@ class EntitySubstitution(object): for codepoint, name in codepoint2name.items(): if codepoint == 34: # There's no point in turning the quotation mark into - # &quot;, unless it happens in an attribute value, which - # is done elsewhere. + # &quot;, unless it happens within an attribute value, which + # is handled elsewhere. 
continue; character = unichr(codepoint) characters.append(character) diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index 23f8c33..6af27a8 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -11,7 +11,7 @@ from util import isList DEFAULT_OUTPUT_ENCODING = "utf-8" -class PageElement(object): +class PageElement(EntitySubstitution): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" @@ -334,6 +334,9 @@ class PageElement(object): class NavigableString(unicode, PageElement): + PREFIX = '' + SUFFIX = '' + def __new__(cls, value): """Create a new NavigableString. @@ -358,29 +361,35 @@ class NavigableString(unicode, PageElement): else: raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) - def decodeGivenEventualEncoding(self, eventual_encoding): - return self + def output_ready(self, substitute_html_entities=False): + if substitute_html_entities: + output = self.substitute_html(self) + else: + output = self + return self.PREFIX + output + self.SUFFIX + class CData(NavigableString): - def decodeGivenEventualEncoding(self, eventual_encoding): - return u'<![CDATA[' + self + u']]>' + PREFIX = u'<![CDATA[' + SUFFIX = u']]>' + class ProcessingInstruction(NavigableString): - def decodeGivenEventualEncoding(self, eventual_encoding): - output = self - if u'%SOUP-ENCODING%' in output: - output = self.substituteEncoding(output, eventual_encoding) - return u'<?' + output + u'?>' + PREFIX = u'<?' + SUFFIX = u'?>' + class Comment(NavigableString): - def decodeGivenEventualEncoding(self, eventual_encoding): - return u'<!--' + self + u'-->' + + PREFIX = u'<!--' + SUFFIX = u'-->' class Declaration(NavigableString): - def decodeGivenEventualEncoding(self, eventual_encoding): - return u'<!' + self + u'>' + PREFIX = u'<!' 
+ SUFFIX = u'!>' + class Doctype(NavigableString): @@ -394,10 +403,11 @@ class Doctype(NavigableString): return Doctype(value) - def decodeGivenEventualEncoding(self, eventual_encoding): - return u'<!DOCTYPE ' + self + u'>' + PREFIX = u'<!DOCTYPE ' + SUFFIX = u'>' + -class Tag(PageElement, EntitySubstitution): +class Tag(PageElement): """Represents a found HTML tag with its attributes and contents.""" @@ -410,19 +420,14 @@ class Tag(PageElement, EntitySubstitution): self.parserClass = parser.__class__ self.name = name if attrs == None: - attrs = [] - if isinstance(attrs, types.DictType): - self.attrMap = attrs + attrs = {} + else: + attrs = dict(attrs) self.attrs = attrs self.contents = [] self.setup(parent, previous) self.hidden = False - if isinstance(attrs, types.DictType): - self.attrs = [kv for kv in attrs.items()] - else: - self.attrs = list(attrs) - # Set up any substitutions, such as the charset in a META tag. self.contains_substitutions = builder.set_up_substitutions(self) @@ -468,15 +473,15 @@ class Tag(PageElement, EntitySubstitution): """Returns the value of the 'key' attribute for the tag, or the value given for 'default' if it doesn't have that attribute.""" - return self._getAttrMap().get(key, default) + return self.attrs.get(key, default) def has_key(self, key): - return self._getAttrMap().has_key(key) + return self.attrs.has_key(key) def __getitem__(self, key): """tag[key] returns the value of the 'key' attribute for the tag, and throws an exception if it's not there.""" - return self._getAttrMap()[key] + return self.attrs[key] def __iter__(self): "Iterating over a tag iterates over its contents." 
@@ -496,27 +501,12 @@ class Tag(PageElement, EntitySubstitution): def __setitem__(self, key, value): """Setting tag[key] sets the value of the 'key' attribute for the tag.""" - self._getAttrMap() - self.attrMap[key] = value - found = False - for i in range(0, len(self.attrs)): - if self.attrs[i][0] == key: - self.attrs[i] = (key, value) - found = True - if not found: - self.attrs.append((key, value)) - self._getAttrMap()[key] = value + self.attrs[key] = value def __delitem__(self, key): "Deleting tag[key] deletes all 'key' attributes for the tag." - for item in self.attrs: - if item[0] == key: - self.attrs.remove(item) - #We don't break because bad HTML can define the same - #attribute multiple times. - self._getAttrMap() - if self.attrMap.has_key(key): - del self.attrMap[key] + if self.attrs.has_key(key): + del self.attrs[key] def __call__(self, *args, **kwargs): """Calling a tag like a function is the same as calling its @@ -552,7 +542,7 @@ class Tag(PageElement, EntitySubstitution): def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): """Renders this tag as a string.""" - return self.decode(eventual_encoding=encoding) + return self.encode(encoding) def __unicode__(self): return self.decode() @@ -561,17 +551,25 @@ class Tag(PageElement, EntitySubstitution): return self.encode() def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, - pretty_print=False, indent_level=0): - return self.decode(pretty_print, indent_level, encoding).encode(encoding) - - def decode(self, pretty_print=False, indent_level=0, - eventual_encoding=DEFAULT_OUTPUT_ENCODING): - """Returns a string or Unicode representation of this tag and - its contents. 
To get Unicode, pass None for encoding.""" - + indent_level=None, substitute_html_entities=False): + return self.decode(indent_level, encoding, + substitute_html_entities).encode(encoding) + + def decode(self, indent_level=None, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + substitute_html_entities=False): + """Returns a Unicode representation of this tag and its contents. + + :param eventual_encoding: The tag is destined to be + encoded into this encoding. This method is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a <META> tag that mentions the document's + encoding. + """ attrs = [] if self.attrs: - for key, val in self.attrs: + for key, val in sorted(self.attrs.items()): if val is None: decoded = key else: @@ -591,14 +589,18 @@ class Tag(PageElement, EntitySubstitution): else: closeTag = '</%s>' % self.name - indentTag, indentContents = 0, 0 + pretty_print = (indent_level is not None) if pretty_print: - indentTag = indent_level - space = (' ' * (indentTag-1)) - indentContents = indentTag + 1 - contents = self.decodeContents(pretty_print, indentContents, - eventual_encoding) + space = (' ' * (indent_level-1)) + indent_contents = indent_level + 1 + else: + space = '' + indent_contents = None + contents = self.decode_contents( + indent_contents, eventual_encoding, substitute_html_entities) + if self.hidden: + # This is the 'document root' object. 
s = contents else: s = [] @@ -634,22 +636,28 @@ class Tag(PageElement, EntitySubstitution): def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): return self.encode(encoding, True) - def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING, - pretty_print=False, indent_level=0): - return self.decodeContents(pretty_print, indent_level).encode(encoding) - - def decodeContents(self, pretty_print=False, indent_level=0, - eventual_encoding=DEFAULT_OUTPUT_ENCODING): - """Renders the contents of this tag as a string in the given - encoding. If encoding is None, returns a Unicode string..""" + def decode_contents(self, indent_level=None, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + substitute_html_entities=False): + """Renders the contents of this tag as a Unicode string. + + :param eventual_encoding: The tag is destined to be + encoded into this encoding. This method is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a <META> tag that mentions the document's + encoding. 
+ """ + pretty_print = (indent_level is not None) s=[] for c in self: text = None if isinstance(c, NavigableString): - text = c.decodeGivenEventualEncoding(eventual_encoding) + text = c.output_ready(substitute_html_entities) elif isinstance(c, Tag): - s.append(c.decode(pretty_print, indent_level, eventual_encoding)) - if text and pretty_print: + s.append(c.decode(indent_level, eventual_encoding, + substitute_html_entities)) + if text and indent_level: text = text.strip() if text: if pretty_print: @@ -690,17 +698,6 @@ class Tag(PageElement, EntitySubstitution): findAll = find_all # BS3 findChildren = find_all # BS2 - #Private methods - - def _getAttrMap(self): - """Initializes a map representation of this tag's attributes, - if not already initialized.""" - if not getattr(self, 'attrMap'): - self.attrMap = {} - for (key, value) in self.attrs: - self.attrMap[key] = value - return self.attrMap - #Generator methods @property def children(self): diff --git a/tests/test_tree.py b/tests/test_tree.py index 0b3d72e..ea10367 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -741,6 +741,14 @@ class TestElementObjects(SoupTest): self.assertTrue(soup.foo.has_key('attr')) self.assertFalse(soup.foo.has_key('attr2')) + def test_attributes_come_out_in_alphabetical_order(self): + markup = '<b a="1" z="5" m="3" f="2" y="4"></b>' + self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>') + + def test_multiple_values_for_the_same_attribute_are_collapsed(self): + markup = '<b b="20" a="1" b="10" a="2" a="3" a="4"></b>' + self.assertSoupEquals(markup, '<b a="1" b="20"></b>') + def test_string(self): # A tag that contains only a text node makes that node # available as .string. 
@@ -830,6 +838,19 @@ class TestPersistence(SoupTest): class TestSubstitutions(SoupTest): + def test_html_entity_substitution(self): + soup = self.soup( + u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>") + encoded = soup.encode("utf-8", substitute_html_entities=True) + self.assertEquals(encoded, + self.document_for("<b>Sacr&eacute; bleu!</b>")) + + def test_html_entity_substitution_off_by_default(self): + markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>" + soup = self.soup(markup) + encoded = soup.b.encode("utf-8") + self.assertEquals(encoded, markup.encode('utf-8')) + def test_encoding_substitution(self): # Here's the <meta> tag saying that a document is # encoded in Shift-JIS. |