Add 3.0.7a -> 3.0.8 changes, plus some tweaks

author: Aaron DeVore <aaron.devore@gmail.com> 2011-03-05 00:47:57 -0800
committer: Aaron DeVore <aaron.devore@gmail.com> 2011-03-05 00:47:57 -0800
commit: 60fe2eebc961d4dfe41db60add7e8d1b8d1f53db (patch)
tree: 0f448ab70563960bea7504822b9c3d5c5e04a82a
parent: b01f9312a13198d249060dac34ab12629285cdb2 (diff)
2 files changed, 161 insertions, 38 deletions
diff --git a/bs4/element.py b/bs4/element.py
index 6fb6210..ffe13c5 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -11,6 +11,11 @@ from util import isList
 DEFAULT_OUTPUT_ENCODING = "utf-8"
 
 
+def _match_css_class(str):
+    """Build a RE matching `str`, taken literally, as one whitespace-delimited CSS class."""
+    return re.compile(r"(^|.*\s)%s($|\s)" % re.escape(str))
+
+
 class PageElement(object):
     """Contains the navigational information for some part of the page
     (either a tag or a piece of text)"""
@@ -29,10 +34,10 @@ class PageElement(object):
 
     def replace_with(self, replace_with):
         oldParent = self.parent
-        myIndex = self.parent.contents.index(self)
-        if hasattr(replace_with, 'parent') and replace_with.parent == self.parent:
+        myIndex = self.parent.index(self)
+        if hasattr(replace_with, 'parent') and replace_with.parent is self.parent:
             # We're replacing this element with one of its siblings.
-            index = self.parent.contents.index(replace_with)
+            index = self.parent.index(replace_with)
             if index and index < myIndex:
                 # Furthermore, it comes before this element. That
                 # means that when we extract it, the index of this
@@ -40,15 +45,20 @@ class PageElement(object):
                 myIndex = myIndex - 1
         self.extract()
         oldParent.insert(myIndex, replace_with)
-    replaceWith = replace_with # BS4
+    replaceWith = replace_with # BS3
+
+    def replace_with_children(self):
+        my_parent = self.parent
+        my_index = self.parent.index(self)
+        self.extract()
+        for child in reversed(self.contents[:]):
+            my_parent.insert(my_index, child)
+    replaceWithChildren = replace_with_children
 
     def extract(self):
         """Destructively rips this element out of the tree."""
         if self.parent:
-            try:
-                self.parent.contents.remove(self)
-            except ValueError:
-                pass
+            del self.parent.contents[self.parent.index(self)]
 
         #Find the two elements that would be next to each other if
         #this element (and any children) hadn't been parsed. Connect
@@ -80,22 +90,20 @@ class PageElement(object):
 
     def insert(self, position, newChild):
         if (isinstance(newChild, basestring)
-            or isinstance(newChild, unicode)) \
-            and not isinstance(newChild, NavigableString):
+            and not isinstance(newChild, NavigableString)):
             newChild = NavigableString(newChild)
 
         position =  min(position, len(self.contents))
-        if hasattr(newChild, 'parent') and newChild.parent != None:
+        if hasattr(newChild, 'parent') and newChild.parent is not None:
             # We're 'inserting' an element that's already one
             # of this object's children.
-            if newChild.parent == self:
-                index = self.find(newChild)
-                if index and index < position:
+            if newChild.parent is self:
+                if self.index(newChild) < position:
                     # Furthermore we're moving it further down the
                     # list of this object's children. That means that
                     # when we extract this element, our target index
                     # will jump down one.
-                    position = position - 1
+                    position -= 1
             newChild.extract()
 
         newChild.parent = self
@@ -239,6 +247,17 @@ class PageElement(object):
 
         if isinstance(name, SoupStrainer):
             strainer = name
+        elif text is None and not limit and not attrs and not kwargs:
+            # findAll*(True)
+            if name is True or name is None:
+                return [element for element in generator
+                        if isinstance(element, Tag)]
+            # findAll*('tag-name')
+            elif isinstance(name, basestring):
+                return [element for element in generator
+                        if isinstance(element, Tag) and element.name == name]
+            else:
+                strainer = SoupStrainer(name, attrs, text, **kwargs)
         else:
             # Build a SoupStrainer
             strainer = SoupStrainer(name, attrs, text, **kwargs)
@@ -261,35 +280,35 @@ class PageElement(object):
     @property
     def next_elements(self):
         i = self
-        while i:
+        while i is not None:
             i = i.next
             yield i
 
     @property
     def next_siblings(self):
         i = self
-        while i:
+        while i is not None:
             i = i.nextSibling
             yield i
 
     @property
     def previous_elements(self):
         i = self
-        while i:
+        while i is not None:
             i = i.previous
             yield i
 
     @property
     def previous_siblings(self):
         i = self
-        while i:
+        while i is not None:
             i = i.previousSibling
             yield i
 
     @property
     def parents(self):
         i = self
-        while i:
+        while i is not None:
             i = i.parent
             yield i
 
@@ -404,7 +423,7 @@ class Tag(PageElement):
         # chunks be garbage-collected.
         self.parserClass = parser.__class__
         self.name = name
-        if attrs == None:
+        if attrs is None:
             attrs = {}
         else:
             attrs = dict(attrs)
@@ -454,6 +473,60 @@ class Tag(PageElement):
             return child
         return child.string
 
+    @string.setter
+    def string(self, string):
+        self.clear()
+        self.append(string)
+
+    def get_text(self, separator=u"", strip=False):
+        """
+        Get all child strings, concatenated using the given separator
+        """
+        if strip:
+            return separator.join(string.strip()
+                for string in self.recursive_children
+                if isinstance(string, NavigableString) and string.strip())
+        else:
+            return separator.join(string
+                for string in self.recursive_children
+                if isinstance(string, NavigableString))
+    getText = get_text
+
+    text = property(get_text)
+
+    def decompose(self):
+        """Recursively destroys the contents of this tree."""
+        self.extract()
+        i = self
+        while i is not None:
+            next = i.next
+            i.__dict__.clear()
+            i = next
+
+    def clear(self, decompose=False):
+        """
+        Extract all children. If decompose is True, decompose instead.
+        """
+        if decompose:
+            for element in self.contents[:]:
+                if isinstance(element, Tag):
+                    element.decompose()
+                else:
+                    element.extract()
+        else:
+            for element in self.contents[:]:
+                element.extract()
+
+    def index(self, element):
+        """
+        Find the index of a child by identity, not value. Avoids issues with
+        tag.contents.index(element) getting the index of equal elements.
+        """
+        for i, child in enumerate(self.contents):
+            if child is element:
+                return i
+        raise ValueError("Tag.index: element not in tag")
+
     def get(self, key, default=None):
         """Returns the value of the 'key' attribute for the tag, or
         the value given for 'default' if it doesn't have that
@@ -510,6 +583,8 @@ class Tag(PageElement):
     def __eq__(self, other):
         """Returns true iff this tag has the same name, the same attributes,
         and the same contents (recursively) as the given tag."""
+        if self is other:
+            return True
         if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
             return False
         for i in range(0, len(self.contents)):
@@ -606,16 +681,6 @@ class Tag(PageElement):
             s = ''.join(s)
         return s
 
-    def decompose(self):
-        """Recursively destroys the contents of this tree."""
-        contents = [i for i in self.contents]
-        for i in contents:
-            if isinstance(i, Tag):
-                i.decompose()
-            else:
-                i.extract()
-        self.extract()
-
     def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
         return self.encode(encoding, True)
 
@@ -684,12 +749,13 @@ class Tag(PageElement):
     #Generator methods
     @property
     def children(self):
+        # return iter() to make the purpose of the method clear
         return iter(self.contents) # XXX This seems to be untested.
 
     @property
     def recursive_children(self):
         if not len(self.contents):
-            raise StopIteration # XXX return instead?
+            return
         stopNode = self._last_recursive_child().next
         current = self.contents[0]
         while current is not stopNode:
@@ -712,7 +778,7 @@ class SoupStrainer(object):
     def __init__(self, name=None, attrs={}, text=None, **kwargs):
         self.name = name
         if isinstance(attrs, basestring):
-            kwargs['class'] = attrs
+            kwargs['class'] = _match_css_class(attrs)
             attrs = None
         if kwargs:
             if attrs:
@@ -795,8 +861,8 @@ class SoupStrainer(object):
     def _matches(self, markup, matchAgainst):
         #print "Matching %s against %s" % (markup, matchAgainst)
         result = False
-        if matchAgainst == True and type(matchAgainst) == types.BooleanType:
-            result = markup != None
+        if matchAgainst is True:
+            result = markup is not None
         elif callable(matchAgainst):
             result = matchAgainst(markup)
         else:
diff --git a/tests/test_tree.py b/tests/test_tree.py
index f2989fe..646c677 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -179,10 +179,13 @@ class TestFindAllByAttribute(TreeTest):
         tree = self.soup("""
                          <a class="1">Class 1.</a>
                          <a class="2">Class 2.</a>
-                         <b class="1">Class 1.</a>
+                         <b class="1">Class 1.</b>
+                         <c class="3 4">Class 3 and 4.</c>
                          """)
         self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
         self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
+        self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
+        self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])
 
     def test_find_all_by_attribute_soupstrainer(self):
         tree = self.soup("""
@@ -242,6 +245,24 @@ class TestFindAllByAttribute(TreeTest):
                            ["One a.", "Two as."])
 
 
+class TestIndex(TreeTest):
+    """Test Tag.index"""
+    def test_index(self):
+        tree = self.soup("""<wrap>
+                            <a>Identical</a>
+                            <b>Not identical</b>
+                            <a>Identical</a>
+
+                            <c><d>Identical with child</d></c>
+                            <b>Also not identical</b>
+                            <c><d>Identical with child</d></c>
+                            </wrap>""")
+        wrap = tree.wrap
+        for i, element in enumerate(wrap.contents):
+            self.assertEqual(i, wrap.index(element))
+        self.assertRaises(ValueError, tree.index, 1)
+
+
 class TestParentOperations(TreeTest):
     """Test navigation and searching through an element's parents."""
 
@@ -591,7 +612,6 @@ class TestTreeModification(SoupTest):
         self.assertEqual(new_text.nextSibling, None)
         self.assertEqual(new_text.next, soup.c)
 
-
     def test_insert_tag(self):
         builder = self.default_builder
         soup = self.soup(
@@ -682,6 +702,14 @@ class TestTreeModification(SoupTest):
         self.assertEqual(g_tag.previous, to_text)
         self.assertEqual(g_tag.previousSibling, to_text)
 
+    def test_replace_with_children(self):
+        tree = self.soup("""
+            <p>Unneeded <em>formatting</em> is unneeded</p>
+            """)
+        tree.em.replace_with_children()
+        self.assertEqual(tree.em, None)
+        self.assertEqual(tree.p.text, "Unneeded formatting is unneeded")
+
     def test_extract(self):
         soup = self.soup(
             '<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>')
@@ -707,6 +735,28 @@ class TestTreeModification(SoupTest):
         self.assertEquals(content_2.previous, content_1)
         self.assertEquals(content_2.previousSibling, content_1)
 
+    def test_clear(self):
+        """Tag.clear()"""
+        soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")
+        # clear using extract()
+        a = soup.a
+        soup.p.clear()
+        self.assertEqual(len(soup.p.contents), 0)
+        self.assertTrue(hasattr(a, "contents"))
+
+        # clear using decompose()
+        em = a.em
+        a.clear(decompose=True)
+        self.assertFalse(hasattr(em, "contents"))
+
+    def test_string_set(self):
+        """Tag.string = 'string'"""
+        soup = self.soup("<a></a> <b><c></c></b>")
+        soup.a.string = "foo"
+        self.assertEqual(soup.a.contents, ["foo"])
+        soup.b.string = "bar"
+        self.assertEqual(soup.b.contents, ["bar"])
+
 
 class TestElementObjects(SoupTest):
     """Test various features of element objects."""
@@ -781,7 +831,6 @@ class TestElementObjects(SoupTest):
         self.assertEqual(soup.a.string, "foo")
         self.assertEqual(soup.string, "foo")
 
-
     def test_lack_of_string(self):
         """Only a tag containing a single text node has a .string."""
         soup = self.soup("<b>f<i>e</i>o</b>")
@@ -790,6 +839,14 @@ class TestElementObjects(SoupTest):
         soup = self.soup("<b></b>")
         self.assertFalse(soup.b.string)
 
+    def test_all_text(self):
+        """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated"""
+        soup = self.soup("<a>a<b>r</b>   <r> t </r></a>")
+        self.assertEqual(soup.a.text, "ar  t ")
+        self.assertEqual(soup.a.get_text(strip=True), "art")
+        self.assertEqual(soup.a.get_text(","), "a,r, , t ")
+        self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
+
 
 class TestPersistence(SoupTest):
     "Testing features like pickle and deepcopy."
author	Aaron DeVore <aaron.devore@gmail.com>	2011-03-05 00:47:57 -0800
committer	Aaron DeVore <aaron.devore@gmail.com>	2011-03-05 00:47:57 -0800
commit	60fe2eebc961d4dfe41db60add7e8d1b8d1f53db (patch)
tree	0f448ab70563960bea7504822b9c3d5c5e04a82a
parent	b01f9312a13198d249060dac34ab12629285cdb2 (diff)