From d7056f49c8bb3a448cec2f1a6f2de55e93c8e8d6 Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonard.richardson@canonical.com>
Date: Sat, 26 Feb 2011 21:26:15 -0500
Subject: First stab at HTML entity replacement.

---
 beautifulsoup/element.py | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

(limited to 'beautifulsoup/element.py')

diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index 23f8c33..f3a59d4 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -561,11 +561,14 @@ class Tag(PageElement, EntitySubstitution):
         return self.encode()
 
     def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
-               pretty_print=False, indent_level=0):
-        return self.decode(pretty_print, indent_level, encoding).encode(encoding)
+               pretty_print=False, indent_level=0,
+               replace_with_html_entities=False):
+        return self.decode(pretty_print, indent_level, encoding,
+                           replace_with_html_entities).encode(encoding)
 
     def decode(self, pretty_print=False, indent_level=0,
-               eventual_encoding=DEFAULT_OUTPUT_ENCODING):
+               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
+               replace_with_html_entities=False):
         """Returns a string or Unicode representation of this tag and
         its contents. To get Unicode, pass None for encoding."""
 
@@ -597,7 +600,8 @@ class Tag(PageElement, EntitySubstitution):
             space = (' ' * (indentTag-1))
             indentContents = indentTag + 1
         contents = self.decodeContents(pretty_print, indentContents,
-                                       eventual_encoding)
+                                       eventual_encoding,
+                                       replace_with_html_entities)
         if self.hidden:
             s = contents
         else:
@@ -635,11 +639,15 @@ class Tag(PageElement, EntitySubstitution):
         return self.encode(encoding, True)
 
     def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
-                       pretty_print=False, indent_level=0):
-        return self.decodeContents(pretty_print, indent_level).encode(encoding)
+                       pretty_print=False, indent_level=0,
+                       replace_With_html_entities=False):
+        return self.decodeContents(
+            pretty_print, indent_level, replace_with_html_entities).encode(
+            encoding)
 
     def decodeContents(self, pretty_print=False, indent_level=0,
-                       eventual_encoding=DEFAULT_OUTPUT_ENCODING):
+                       eventual_encoding=DEFAULT_OUTPUT_ENCODING,
+                       replace_with_html_entities=False):
         """Renders the contents of this tag as a string in the given
         encoding. If encoding is None, returns a Unicode string.."""
         s=[]
@@ -648,10 +656,13 @@ class Tag(PageElement, EntitySubstitution):
             if isinstance(c, NavigableString):
                 text = c.decodeGivenEventualEncoding(eventual_encoding)
             elif isinstance(c, Tag):
-                s.append(c.decode(pretty_print, indent_level, eventual_encoding))
+                s.append(c.decode(pretty_print, indent_level, eventual_encoding,
+                                  replace_with_html_entities))
             if text and pretty_print:
                 text = text.strip()
             if text:
+                if replace_with_html_entities:
+                    text = self.substitute_html(text)
                 if pretty_print:
                     s.append(" " * (indent_level-1))
                 s.append(text)
-- 
cgit v1.2.3


From 247785aa53358fa64e9bd5f799c4c9a1609489f0 Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonard.richardson@canonical.com>
Date: Sat, 26 Feb 2011 22:54:04 -0500
Subject: Renamed replace_with_html_entities to substitute_html_entities.

---
 beautifulsoup/element.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'beautifulsoup/element.py')

diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index f3a59d4..bbecdbd 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -562,13 +562,13 @@ class Tag(PageElement, EntitySubstitution):
 
     def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
                pretty_print=False, indent_level=0,
-               replace_with_html_entities=False):
+               substitute_html_entities=False):
         return self.decode(pretty_print, indent_level, encoding,
-                           replace_with_html_entities).encode(encoding)
+                           substitute_html_entities).encode(encoding)
 
     def decode(self, pretty_print=False, indent_level=0,
                eventual_encoding=DEFAULT_OUTPUT_ENCODING,
-               replace_with_html_entities=False):
+               substitute_html_entities=False):
         """Returns a string or Unicode representation of this tag and
         its contents. To get Unicode, pass None for encoding."""
 
@@ -601,7 +601,7 @@ class Tag(PageElement, EntitySubstitution):
             indentContents = indentTag + 1
         contents = self.decodeContents(pretty_print, indentContents,
                                        eventual_encoding,
-                                       replace_with_html_entities)
+                                       substitute_html_entities)
         if self.hidden:
             s = contents
         else:
@@ -642,12 +642,12 @@ class Tag(PageElement, EntitySubstitution):
                        pretty_print=False, indent_level=0,
                        replace_With_html_entities=False):
         return self.decodeContents(
-            pretty_print, indent_level, replace_with_html_entities).encode(
+            pretty_print, indent_level, substitute_html_entities).encode(
             encoding)
 
     def decodeContents(self, pretty_print=False, indent_level=0,
                        eventual_encoding=DEFAULT_OUTPUT_ENCODING,
-                       replace_with_html_entities=False):
+                       substitute_html_entities=False):
         """Renders the contents of this tag as a string in the given
         encoding. If encoding is None, returns a Unicode string.."""
         s=[]
@@ -657,11 +657,11 @@ class Tag(PageElement, EntitySubstitution):
                 text = c.decodeGivenEventualEncoding(eventual_encoding)
             elif isinstance(c, Tag):
                 s.append(c.decode(pretty_print, indent_level, eventual_encoding,
-                                  replace_with_html_entities))
+                                  substitute_html_entities))
             if text and pretty_print:
                 text = text.strip()
             if text:
-                if replace_with_html_entities:
+                if substitute_html_entities:
                     text = self.substitute_html(text)
                 if pretty_print:
                     s.append(" " * (indent_level-1))
-- 
cgit v1.2.3


From cb369db2e4a16816ae78ff6eff7d8d3393141569 Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonard.richardson@canonical.com>
Date: Sat, 26 Feb 2011 23:05:10 -0500
Subject: Refactored the code that makes a string output-ready.

---
 beautifulsoup/element.py | 48 ++++++++++++++++++++++++++++--------------------
 1 file changed, 28 insertions(+), 20 deletions(-)

(limited to 'beautifulsoup/element.py')

diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index bbecdbd..3515d50 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -11,7 +11,7 @@ from util import isList
 DEFAULT_OUTPUT_ENCODING = "utf-8"
 
 
-class PageElement(object):
+class PageElement(EntitySubstitution):
     """Contains the navigational information for some part of the page
     (either a tag or a piece of text)"""
 
@@ -334,6 +334,9 @@ class PageElement(object):
 
 class NavigableString(unicode, PageElement):
 
+    PREFIX = ''
+    SUFFIX = ''
+
     def __new__(cls, value):
         """Create a new NavigableString.
 
@@ -358,29 +361,35 @@ class NavigableString(unicode, PageElement):
         else:
             raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
 
-    def decodeGivenEventualEncoding(self, eventual_encoding):
-        return self
+    def output_ready(self, substitute_html_entities=False):
+        if substitute_html_entities:
+            output = self.substitute_html(self)
+        else:
+            output = self
+        return self.PREFIX + output + self.SUFFIX
+
 
 class CData(NavigableString):
 
-    def decodeGivenEventualEncoding(self, eventual_encoding):
-        return u'<![CDATA[' + self + u']]>'
+    PREFIX = u'<![CDATA['
+    SUFFIX = u']]>'
+
 
 class ProcessingInstruction(NavigableString):
 
-    def decodeGivenEventualEncoding(self, eventual_encoding):
-        output = self
-        if u'%SOUP-ENCODING%' in output:
-            output = self.substituteEncoding(output, eventual_encoding)
-        return u'<?' + output + u'?>'
+    PREFIX = u'<?'
+    SUFFIX = u'?>'
+
 
 class Comment(NavigableString):
-    def decodeGivenEventualEncoding(self, eventual_encoding):
-        return u'<!--' + self + u'-->'
+
+    PREFIX = u'<!--'
+    SUFFIX = u'-->'
 
 class Declaration(NavigableString):
-    def decodeGivenEventualEncoding(self, eventual_encoding):
-        return u'<!' + self + u'>'
+    PREFIX = u'<!'
+    SUFFIX = u'>'  # a declaration closes with a bare '>', not '!>'
+
 
 class Doctype(NavigableString):
 
@@ -394,10 +403,11 @@ class Doctype(NavigableString):
 
         return Doctype(value)
 
-    def decodeGivenEventualEncoding(self, eventual_encoding):
-        return u'<!DOCTYPE ' + self + u'>'
+    PREFIX = u'<!DOCTYPE '
+    SUFFIX = u'>'
+
 
-class Tag(PageElement, EntitySubstitution):
+class Tag(PageElement):
 
     """Represents a found HTML tag with its attributes and contents."""
 
@@ -654,15 +664,13 @@ class Tag(PageElement, EntitySubstitution):
         for c in self:
             text = None
             if isinstance(c, NavigableString):
-                text = c.decodeGivenEventualEncoding(eventual_encoding)
+                text = c.output_ready(substitute_html_entities)
             elif isinstance(c, Tag):
                 s.append(c.decode(pretty_print, indent_level, eventual_encoding,
                                   substitute_html_entities))
             if text and pretty_print:
                 text = text.strip()
             if text:
-                if substitute_html_entities:
-                    text = self.substitute_html(text)
                 if pretty_print:
                     s.append(" " * (indent_level-1))
                 s.append(text)
-- 
cgit v1.2.3


From a5f2ee72f16d0b6755ab84ddf2eff8e3d412e755 Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonard.richardson@canonical.com>
Date: Sat, 26 Feb 2011 23:11:54 -0500
Subject: Cleaned up decodeContents, and removed encodeContents, which isn't
 used.

---
 beautifulsoup/element.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'beautifulsoup/element.py')

diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index 3515d50..e6e7adb 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -609,7 +609,7 @@ class Tag(PageElement):
             indentTag = indent_level
             space = (' ' * (indentTag-1))
             indentContents = indentTag + 1
-        contents = self.decodeContents(pretty_print, indentContents,
+        contents = self.decode_contents(pretty_print, indentContents,
                                        eventual_encoding,
                                        substitute_html_entities)
         if self.hidden:
@@ -648,18 +648,18 @@ class Tag(PageElement):
     def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
         return self.encode(encoding, True)
 
-    def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
-                       pretty_print=False, indent_level=0,
-                       replace_With_html_entities=False):
-        return self.decodeContents(
-            pretty_print, indent_level, substitute_html_entities).encode(
-            encoding)
-
-    def decodeContents(self, pretty_print=False, indent_level=0,
+    def decode_contents(self, pretty_print=False, indent_level=0,
                        eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                        substitute_html_entities=False):
-        """Renders the contents of this tag as a string in the given
-        encoding. If encoding is None, returns a Unicode string.."""
+        """Renders the contents of this tag as a Unicode string.
+
+        :param eventual_encoding: The document is destined to be
+           encoded into this encoding. This method is _not_
+           responsible for performing that encoding. This information
+           is passed in so that it can be substituted in if the
+           document contains a <META> tag that mentions the document's
+           encoding.
+        """
         s=[]
         for c in self:
             text = None
-- 
cgit v1.2.3


From 5082ca97607846f3be18859f1b803ac15ef77083 Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonard.richardson@canonical.com>
Date: Sat, 26 Feb 2011 23:19:10 -0500
Subject: Removed the redundant pretty_print argument except for the top level.

---
 beautifulsoup/element.py | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

(limited to 'beautifulsoup/element.py')

diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index e6e7adb..519a9da 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -571,12 +571,11 @@ class Tag(PageElement):
         return self.encode()
 
     def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
-               pretty_print=False, indent_level=0,
-               substitute_html_entities=False):
-        return self.decode(pretty_print, indent_level, encoding,
+               indent_level=None, substitute_html_entities=False):
+        return self.decode(indent_level, encoding,
                            substitute_html_entities).encode(encoding)
 
-    def decode(self, pretty_print=False, indent_level=0,
+    def decode(self, indent_level=None,
                eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                substitute_html_entities=False):
         """Returns a string or Unicode representation of this tag and
@@ -604,15 +603,18 @@ class Tag(PageElement):
         else:
             closeTag = '</%s>' % self.name
 
-        indentTag, indentContents = 0, 0
+        pretty_print = (indent_level is not None)
         if pretty_print:
-            indentTag = indent_level
-            space = (' ' * (indentTag-1))
-            indentContents = indentTag + 1
-        contents = self.decode_contents(pretty_print, indentContents,
-                                       eventual_encoding,
-                                       substitute_html_entities)
+            space = (' ' * (indent_level-1))
+            indent_contents = indent_level + 1
+        else:
+            space = ''
+            indent_contents = None
+        contents = self.decode_contents(
+            indent_contents, eventual_encoding, substitute_html_entities)
+
         if self.hidden:
+            # This is the 'document root' object.
             s = contents
         else:
             s = []
@@ -648,7 +650,7 @@ class Tag(PageElement):
     def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
         return self.encode(encoding, True)
 
-    def decode_contents(self, pretty_print=False, indent_level=0,
+    def decode_contents(self, indent_level=None,
                        eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                        substitute_html_entities=False):
         """Renders the contents of this tag as a Unicode string.
@@ -660,15 +662,16 @@ class Tag(PageElement):
            document contains a <META> tag that mentions the document's
            encoding.
         """
+        pretty_print = (indent_level is not None)
         s=[]
         for c in self:
             text = None
             if isinstance(c, NavigableString):
                 text = c.output_ready(substitute_html_entities)
             elif isinstance(c, Tag):
-                s.append(c.decode(pretty_print, indent_level, eventual_encoding,
+                s.append(c.decode(indent_level, eventual_encoding,
                                   substitute_html_entities))
-            if text and pretty_print:
+            if text and indent_level:
                 text = text.strip()
             if text:
                 if pretty_print:
-- 
cgit v1.2.3


From 1330639ff7ae099ce80e77a8b6be6a0d75b60f04 Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonard.richardson@canonical.com>
Date: Sat, 26 Feb 2011 23:22:42 -0500
Subject: Minor cleanup and adding docstrings.

---
 beautifulsoup/element.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'beautifulsoup/element.py')

diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index 519a9da..d37124f 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -562,7 +562,7 @@ class Tag(PageElement):
 
     def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
         """Renders this tag as a string."""
-        return self.decode(eventual_encoding=encoding)
+        return self.encode(encoding)
 
     def __unicode__(self):
         return self.decode()
@@ -578,9 +578,15 @@ class Tag(PageElement):
     def decode(self, indent_level=None,
                eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                substitute_html_entities=False):
-        """Returns a string or Unicode representation of this tag and
-        its contents. To get Unicode, pass None for encoding."""
+        """Returns a Unicode representation of this tag and its contents.
 
+        :param eventual_encoding: The tag is destined to be
+           encoded into this encoding. This method is _not_
+           responsible for performing that encoding. This information
+           is passed in so that it can be substituted in if the
+           document contains a <META> tag that mentions the document's
+           encoding.
+        """
         attrs = []
         if self.attrs:
             for key, val in self.attrs:
@@ -655,7 +661,7 @@ class Tag(PageElement):
                        substitute_html_entities=False):
         """Renders the contents of this tag as a Unicode string.
 
-        :param eventual_encoding: The document is destined to be
+        :param eventual_encoding: The tag is destined to be
            encoded into this encoding. This method is _not_
            responsible for performing that encoding. This information
            is passed in so that it can be substituted in if the
-- 
cgit v1.2.3


From 8a6e1b5e15368c9dd66b6b407b7328c2bd0360ad Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonard.richardson@canonical.com>
Date: Sat, 26 Feb 2011 23:39:06 -0500
Subject: The attribute list comes in as a dictionary, so stop turning it into
 a list for no reason. Saves code and a little time. Sort outgoing attributes
 so that the tests will run consistently.

---
 beautifulsoup/element.py | 51 ++++++++++--------------------------------------
 1 file changed, 10 insertions(+), 41 deletions(-)

(limited to 'beautifulsoup/element.py')

diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index d37124f..6af27a8 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -420,19 +420,14 @@ class Tag(PageElement):
         self.parserClass = parser.__class__
         self.name = name
         if attrs == None:
-            attrs = []
-        if isinstance(attrs, types.DictType):
-            self.attrMap = attrs
+            attrs = {}
+        else:
+            attrs = dict(attrs)
         self.attrs = attrs
         self.contents = []
         self.setup(parent, previous)
         self.hidden = False
 
-        if isinstance(attrs, types.DictType):
-            self.attrs = [kv for kv in attrs.items()]
-        else:
-            self.attrs = list(attrs)
-
         # Set up any substitutions, such as the charset in a META tag.
         self.contains_substitutions = builder.set_up_substitutions(self)
 
@@ -478,15 +473,15 @@ class Tag(PageElement):
         """Returns the value of the 'key' attribute for the tag, or
         the value given for 'default' if it doesn't have that
         attribute."""
-        return self._getAttrMap().get(key, default)
+        return self.attrs.get(key, default)
 
     def has_key(self, key):
-        return self._getAttrMap().has_key(key)
+        return self.attrs.has_key(key)
 
     def __getitem__(self, key):
         """tag[key] returns the value of the 'key' attribute for the tag,
         and throws an exception if it's not there."""
-        return self._getAttrMap()[key]
+        return self.attrs[key]
 
     def __iter__(self):
         "Iterating over a tag iterates over its contents."
@@ -506,27 +501,12 @@ class Tag(PageElement):
     def __setitem__(self, key, value):
         """Setting tag[key] sets the value of the 'key' attribute for the
         tag."""
-        self._getAttrMap()
-        self.attrMap[key] = value
-        found = False
-        for i in range(0, len(self.attrs)):
-            if self.attrs[i][0] == key:
-                self.attrs[i] = (key, value)
-                found = True
-        if not found:
-            self.attrs.append((key, value))
-        self._getAttrMap()[key] = value
+        self.attrs[key] = value
 
     def __delitem__(self, key):
         "Deleting tag[key] deletes all 'key' attributes for the tag."
-        for item in self.attrs:
-            if item[0] == key:
-                self.attrs.remove(item)
-                #We don't break because bad HTML can define the same
-                #attribute multiple times.
-            self._getAttrMap()
-            if self.attrMap.has_key(key):
-                del self.attrMap[key]
+        if self.attrs.has_key(key):
+            del self.attrs[key]
 
     def __call__(self, *args, **kwargs):
         """Calling a tag like a function is the same as calling its
@@ -589,7 +569,7 @@ class Tag(PageElement):
         """
         attrs = []
         if self.attrs:
-            for key, val in self.attrs:
+            for key, val in sorted(self.attrs.items()):
                 if val is None:
                     decoded = key
                 else:
@@ -718,17 +698,6 @@ class Tag(PageElement):
     findAll = find_all      # BS3
     findChildren = find_all # BS2
 
-    #Private methods
-
-    def _getAttrMap(self):
-        """Initializes a map representation of this tag's attributes,
-        if not already initialized."""
-        if not getattr(self, 'attrMap'):
-            self.attrMap = {}
-            for (key, value) in self.attrs:
-                self.attrMap[key] = value
-        return self.attrMap
-
     #Generator methods
     @property
     def children(self):
-- 
cgit v1.2.3