summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- beautifulsoup/__init__.py 12
-rw-r--r-- beautifulsoup/dammit.py 4
-rw-r--r-- beautifulsoup/element.py 165
-rw-r--r-- tests/test_tree.py 21
4 files changed, 113 insertions, 89 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index cee55e7..53130e0 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -262,8 +262,9 @@ class BeautifulSoup(Tag):
def handle_data(self, data):
self.currentData.append(data)
- def decode(self, pretty_print=False, indent_level=0,
- eventual_encoding=DEFAULT_OUTPUT_ENCODING):
+ def decode(self, pretty_print=False,
+ eventual_encoding=DEFAULT_OUTPUT_ENCODING,
+ substitute_html_entities=False):
"""Returns a string or Unicode representation of this document.
To get Unicode, pass None for encoding."""
if self.is_xml:
@@ -274,8 +275,13 @@ class BeautifulSoup(Tag):
prefix = u'<?xml version="1.0"%s>\n' % encoding_part
else:
prefix = u''
+ if not pretty_print:
+ indent_level = None
+ else:
+ indent_level = 0
return prefix + super(BeautifulSoup, self).decode(
- pretty_print, indent_level, eventual_encoding)
+ indent_level, eventual_encoding,
+ substitute_html_entities)
class StopParsing(Exception):
diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py
index 9833bd4..31dfa95 100644
--- a/beautifulsoup/dammit.py
+++ b/beautifulsoup/dammit.py
@@ -37,8 +37,8 @@ class EntitySubstitution(object):
for codepoint, name in codepoint2name.items():
if codepoint == 34:
# There's no point in turning the quotation mark into
- # &quot;, unless it happens in an attribute value, which
- # is done elsewhere.
+ # &quot;, unless it happens within an attribute value, which
+ # is handled elsewhere.
continue;
character = unichr(codepoint)
characters.append(character)
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index 23f8c33..6af27a8 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -11,7 +11,7 @@ from util import isList
DEFAULT_OUTPUT_ENCODING = "utf-8"
-class PageElement(object):
+class PageElement(EntitySubstitution):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
@@ -334,6 +334,9 @@ class PageElement(object):
class NavigableString(unicode, PageElement):
+ PREFIX = ''
+ SUFFIX = ''
+
def __new__(cls, value):
"""Create a new NavigableString.
@@ -358,29 +361,35 @@ class NavigableString(unicode, PageElement):
else:
raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
- def decodeGivenEventualEncoding(self, eventual_encoding):
- return self
+ def output_ready(self, substitute_html_entities=False):
+ if substitute_html_entities:
+ output = self.substitute_html(self)
+ else:
+ output = self
+ return self.PREFIX + output + self.SUFFIX
+
class CData(NavigableString):
- def decodeGivenEventualEncoding(self, eventual_encoding):
- return u'<![CDATA[' + self + u']]>'
+ PREFIX = u'<![CDATA['
+ SUFFIX = u']]>'
+
class ProcessingInstruction(NavigableString):
- def decodeGivenEventualEncoding(self, eventual_encoding):
- output = self
- if u'%SOUP-ENCODING%' in output:
- output = self.substituteEncoding(output, eventual_encoding)
- return u'<?' + output + u'?>'
+ PREFIX = u'<?'
+ SUFFIX = u'?>'
+
class Comment(NavigableString):
- def decodeGivenEventualEncoding(self, eventual_encoding):
- return u'<!--' + self + u'-->'
+
+ PREFIX = u'<!--'
+ SUFFIX = u'-->'
class Declaration(NavigableString):
- def decodeGivenEventualEncoding(self, eventual_encoding):
- return u'<!' + self + u'>'
+ PREFIX = u'<!'
+ SUFFIX = u'!>'
+
class Doctype(NavigableString):
@@ -394,10 +403,11 @@ class Doctype(NavigableString):
return Doctype(value)
- def decodeGivenEventualEncoding(self, eventual_encoding):
- return u'<!DOCTYPE ' + self + u'>'
+ PREFIX = u'<!DOCTYPE '
+ SUFFIX = u'>'
+
-class Tag(PageElement, EntitySubstitution):
+class Tag(PageElement):
"""Represents a found HTML tag with its attributes and contents."""
@@ -410,19 +420,14 @@ class Tag(PageElement, EntitySubstitution):
self.parserClass = parser.__class__
self.name = name
if attrs == None:
- attrs = []
- if isinstance(attrs, types.DictType):
- self.attrMap = attrs
+ attrs = {}
+ else:
+ attrs = dict(attrs)
self.attrs = attrs
self.contents = []
self.setup(parent, previous)
self.hidden = False
- if isinstance(attrs, types.DictType):
- self.attrs = [kv for kv in attrs.items()]
- else:
- self.attrs = list(attrs)
-
# Set up any substitutions, such as the charset in a META tag.
self.contains_substitutions = builder.set_up_substitutions(self)
@@ -468,15 +473,15 @@ class Tag(PageElement, EntitySubstitution):
"""Returns the value of the 'key' attribute for the tag, or
the value given for 'default' if it doesn't have that
attribute."""
- return self._getAttrMap().get(key, default)
+ return self.attrs.get(key, default)
def has_key(self, key):
- return self._getAttrMap().has_key(key)
+ return self.attrs.has_key(key)
def __getitem__(self, key):
"""tag[key] returns the value of the 'key' attribute for the tag,
and throws an exception if it's not there."""
- return self._getAttrMap()[key]
+ return self.attrs[key]
def __iter__(self):
"Iterating over a tag iterates over its contents."
@@ -496,27 +501,12 @@ class Tag(PageElement, EntitySubstitution):
def __setitem__(self, key, value):
"""Setting tag[key] sets the value of the 'key' attribute for the
tag."""
- self._getAttrMap()
- self.attrMap[key] = value
- found = False
- for i in range(0, len(self.attrs)):
- if self.attrs[i][0] == key:
- self.attrs[i] = (key, value)
- found = True
- if not found:
- self.attrs.append((key, value))
- self._getAttrMap()[key] = value
+ self.attrs[key] = value
def __delitem__(self, key):
"Deleting tag[key] deletes all 'key' attributes for the tag."
- for item in self.attrs:
- if item[0] == key:
- self.attrs.remove(item)
- #We don't break because bad HTML can define the same
- #attribute multiple times.
- self._getAttrMap()
- if self.attrMap.has_key(key):
- del self.attrMap[key]
+ if self.attrs.has_key(key):
+ del self.attrs[key]
def __call__(self, *args, **kwargs):
"""Calling a tag like a function is the same as calling its
@@ -552,7 +542,7 @@ class Tag(PageElement, EntitySubstitution):
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
"""Renders this tag as a string."""
- return self.decode(eventual_encoding=encoding)
+ return self.encode(encoding)
def __unicode__(self):
return self.decode()
@@ -561,17 +551,25 @@ class Tag(PageElement, EntitySubstitution):
return self.encode()
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
- pretty_print=False, indent_level=0):
- return self.decode(pretty_print, indent_level, encoding).encode(encoding)
-
- def decode(self, pretty_print=False, indent_level=0,
- eventual_encoding=DEFAULT_OUTPUT_ENCODING):
- """Returns a string or Unicode representation of this tag and
- its contents. To get Unicode, pass None for encoding."""
-
+ indent_level=None, substitute_html_entities=False):
+ return self.decode(indent_level, encoding,
+ substitute_html_entities).encode(encoding)
+
+ def decode(self, indent_level=None,
+ eventual_encoding=DEFAULT_OUTPUT_ENCODING,
+ substitute_html_entities=False):
+ """Returns a Unicode representation of this tag and its contents.
+
+ :param eventual_encoding: The tag is destined to be
+ encoded into this encoding. This method is _not_
+ responsible for performing that encoding. This information
+ is passed in so that it can be substituted in if the
+ document contains a <META> tag that mentions the document's
+ encoding.
+ """
attrs = []
if self.attrs:
- for key, val in self.attrs:
+ for key, val in sorted(self.attrs.items()):
if val is None:
decoded = key
else:
@@ -591,14 +589,18 @@ class Tag(PageElement, EntitySubstitution):
else:
closeTag = '</%s>' % self.name
- indentTag, indentContents = 0, 0
+ pretty_print = (indent_level is not None)
if pretty_print:
- indentTag = indent_level
- space = (' ' * (indentTag-1))
- indentContents = indentTag + 1
- contents = self.decodeContents(pretty_print, indentContents,
- eventual_encoding)
+ space = (' ' * (indent_level-1))
+ indent_contents = indent_level + 1
+ else:
+ space = ''
+ indent_contents = None
+ contents = self.decode_contents(
+ indent_contents, eventual_encoding, substitute_html_entities)
+
if self.hidden:
+ # This is the 'document root' object.
s = contents
else:
s = []
@@ -634,22 +636,28 @@ class Tag(PageElement, EntitySubstitution):
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
return self.encode(encoding, True)
- def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
- pretty_print=False, indent_level=0):
- return self.decodeContents(pretty_print, indent_level).encode(encoding)
-
- def decodeContents(self, pretty_print=False, indent_level=0,
- eventual_encoding=DEFAULT_OUTPUT_ENCODING):
- """Renders the contents of this tag as a string in the given
- encoding. If encoding is None, returns a Unicode string.."""
+ def decode_contents(self, indent_level=None,
+ eventual_encoding=DEFAULT_OUTPUT_ENCODING,
+ substitute_html_entities=False):
+ """Renders the contents of this tag as a Unicode string.
+
+ :param eventual_encoding: The tag is destined to be
+ encoded into this encoding. This method is _not_
+ responsible for performing that encoding. This information
+ is passed in so that it can be substituted in if the
+ document contains a <META> tag that mentions the document's
+ encoding.
+ """
+ pretty_print = (indent_level is not None)
s=[]
for c in self:
text = None
if isinstance(c, NavigableString):
- text = c.decodeGivenEventualEncoding(eventual_encoding)
+ text = c.output_ready(substitute_html_entities)
elif isinstance(c, Tag):
- s.append(c.decode(pretty_print, indent_level, eventual_encoding))
- if text and pretty_print:
+ s.append(c.decode(indent_level, eventual_encoding,
+ substitute_html_entities))
+ if text and indent_level:
text = text.strip()
if text:
if pretty_print:
@@ -690,17 +698,6 @@ class Tag(PageElement, EntitySubstitution):
findAll = find_all # BS3
findChildren = find_all # BS2
- #Private methods
-
- def _getAttrMap(self):
- """Initializes a map representation of this tag's attributes,
- if not already initialized."""
- if not getattr(self, 'attrMap'):
- self.attrMap = {}
- for (key, value) in self.attrs:
- self.attrMap[key] = value
- return self.attrMap
-
#Generator methods
@property
def children(self):
diff --git a/tests/test_tree.py b/tests/test_tree.py
index 0b3d72e..ea10367 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -741,6 +741,14 @@ class TestElementObjects(SoupTest):
self.assertTrue(soup.foo.has_key('attr'))
self.assertFalse(soup.foo.has_key('attr2'))
+ def test_attributes_come_out_in_alphabetical_order(self):
+ markup = '<b a="1" z="5" m="3" f="2" y="4"></b>'
+ self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>')
+
+ def test_multiple_values_for_the_same_attribute_are_collapsed(self):
+ markup = '<b b="20" a="1" b="10" a="2" a="3" a="4"></b>'
+ self.assertSoupEquals(markup, '<b a="1" b="20"></b>')
+
def test_string(self):
# A tag that contains only a text node makes that node
# available as .string.
@@ -830,6 +838,19 @@ class TestPersistence(SoupTest):
class TestSubstitutions(SoupTest):
+ def test_html_entity_substitution(self):
+ soup = self.soup(
+ u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>")
+ encoded = soup.encode("utf-8", substitute_html_entities=True)
+ self.assertEquals(encoded,
+ self.document_for("<b>Sacr&eacute; bleu!</b>"))
+
+ def test_html_entity_substitution_off_by_default(self):
+ markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
+ soup = self.soup(markup)
+ encoded = soup.b.encode("utf-8")
+ self.assertEquals(encoded, markup.encode('utf-8'))
+
def test_encoding_substitution(self):
# Here's the <meta> tag saying that a document is
# encoded in Shift-JIS.