summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Aaron DeVore <aaron.devore@gmail.com> 2011-03-05 00:47:57 -0800
committer Aaron DeVore <aaron.devore@gmail.com> 2011-03-05 00:47:57 -0800
commit 60fe2eebc961d4dfe41db60add7e8d1b8d1f53db (patch)
tree 0f448ab70563960bea7504822b9c3d5c5e04a82a
parent b01f9312a13198d249060dac34ab12629285cdb2 (diff)
Add 3.0.7a -> 3.0.8 changes, plus some tweaks
-rw-r--r-- bs4/element.py 136
-rw-r--r-- tests/test_tree.py 63
2 files changed, 161 insertions, 38 deletions
diff --git a/bs4/element.py b/bs4/element.py
index 6fb6210..ffe13c5 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -11,6 +11,11 @@ from util import isList
DEFAULT_OUTPUT_ENCODING = "utf-8"
+def _match_css_class(str):
+ """Build a RE to match the given CSS class."""
+ return re.compile(r"(^|.*\s)%s($|\s)" % str)
+
+
class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
@@ -29,10 +34,10 @@ class PageElement(object):
def replace_with(self, replace_with):
oldParent = self.parent
- myIndex = self.parent.contents.index(self)
- if hasattr(replace_with, 'parent') and replace_with.parent == self.parent:
+ myIndex = self.parent.index(self)
+ if hasattr(replace_with, 'parent') and replace_with.parent is self.parent:
# We're replacing this element with one of its siblings.
- index = self.parent.contents.index(replace_with)
+ index = self.parent.index(replace_with)
if index and index < myIndex:
# Furthermore, it comes before this element. That
# means that when we extract it, the index of this
@@ -40,15 +45,20 @@ class PageElement(object):
myIndex = myIndex - 1
self.extract()
oldParent.insert(myIndex, replace_with)
- replaceWith = replace_with # BS4
+ replaceWith = replace_with # BS3
+
+ def replace_with_children(self):
+ my_parent = self.parent
+ my_index = self.parent.index(self)
+ self.extract()
+ for child in reversed(self.contents[:]):
+ my_parent.insert(my_index, child)
+ replaceWithChildren = replace_with_children
def extract(self):
"""Destructively rips this element out of the tree."""
if self.parent:
- try:
- self.parent.contents.remove(self)
- except ValueError:
- pass
+ del self.parent.contents[self.parent.index(self)]
#Find the two elements that would be next to each other if
#this element (and any children) hadn't been parsed. Connect
@@ -80,22 +90,20 @@ class PageElement(object):
def insert(self, position, newChild):
if (isinstance(newChild, basestring)
- or isinstance(newChild, unicode)) \
- and not isinstance(newChild, NavigableString):
+ and not isinstance(newChild, NavigableString)):
newChild = NavigableString(newChild)
position = min(position, len(self.contents))
- if hasattr(newChild, 'parent') and newChild.parent != None:
+ if hasattr(newChild, 'parent') and newChild.parent is not None:
# We're 'inserting' an element that's already one
# of this object's children.
- if newChild.parent == self:
- index = self.find(newChild)
- if index and index < position:
+ if newChild.parent is self:
+ if self.index(newChild) > position:
# Furthermore we're moving it further down the
# list of this object's children. That means that
# when we extract this element, our target index
# will jump down one.
- position = position - 1
+ position -= 1
newChild.extract()
newChild.parent = self
@@ -239,6 +247,17 @@ class PageElement(object):
if isinstance(name, SoupStrainer):
strainer = name
+ elif text is None and not limit and not attrs and not kwargs:
+ # findAll*(True)
+ if name is True or name is None:
+ return [element for element in generator
+ if isinstance(element, Tag)]
+ # findAll*('tag-name')
+ elif isinstance(name, basestring):
+ return [element for element in generator
+ if isinstance(element, Tag) and element.name == name]
+ else:
+ strainer = SoupStrainer(name, attrs, text, **kwargs)
else:
# Build a SoupStrainer
strainer = SoupStrainer(name, attrs, text, **kwargs)
@@ -261,35 +280,35 @@ class PageElement(object):
@property
def next_elements(self):
i = self
- while i:
+ while i is not None:
i = i.next
yield i
@property
def next_siblings(self):
i = self
- while i:
+ while i is not None:
i = i.nextSibling
yield i
@property
def previous_elements(self):
i = self
- while i:
+ while i is not None:
i = i.previous
yield i
@property
def previous_siblings(self):
i = self
- while i:
+ while i is not None:
i = i.previousSibling
yield i
@property
def parents(self):
i = self
- while i:
+ while i is not None:
i = i.parent
yield i
@@ -404,7 +423,7 @@ class Tag(PageElement):
# chunks be garbage-collected.
self.parserClass = parser.__class__
self.name = name
- if attrs == None:
+ if attrs is None:
attrs = {}
else:
attrs = dict(attrs)
@@ -454,6 +473,60 @@ class Tag(PageElement):
return child
return child.string
+ @string.setter
+ def string(self, string):
+ self.clear()
+ self.append(string)
+
+ def get_text(self, separator=u"", strip=False):
+ """
+ Get all child strings, concatenated using the given separator
+ """
+ if strip:
+ return separator.join(string.strip()
+ for string in self.recursive_children
+ if isinstance(string, NavigableString) and string.strip())
+ else:
+ return separator.join(string
+ for string in self.recursive_children
+ if isinstance(string, NavigableString))
+ getText = get_text
+
+ text = property(get_text)
+
+ def decompose(self):
+ """Recursively destroys the contents of this tree."""
+ self.extract()
+ i = self
+ while i is not None:
+ next = i.next
+ i.__dict__.clear()
+ i = next
+
+ def clear(self, decompose=False):
+ """
+ Extract all children. If decompose is True, decompose instead.
+ """
+ if decompose:
+ for element in self.contents[:]:
+ if isinstance(element, Tag):
+ element.decompose()
+ else:
+ element.extract()
+ else:
+ for element in self.contents[:]:
+ element.extract()
+
+ def index(self, element):
+ """
+ Find the index of a child by identity, not value. Avoids issues with
+ tag.contents.index(element) getting the index of equal elements.
+ """
+ for i, child in enumerate(self.contents):
+ if child is element:
+ return i
+ raise ValueError("Tag.index: element not in tag")
+
def get(self, key, default=None):
"""Returns the value of the 'key' attribute for the tag, or
the value given for 'default' if it doesn't have that
@@ -510,6 +583,8 @@ class Tag(PageElement):
def __eq__(self, other):
"""Returns true iff this tag has the same name, the same attributes,
and the same contents (recursively) as the given tag."""
+ if self is other:
+ return True
if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
return False
for i in range(0, len(self.contents)):
@@ -606,16 +681,6 @@ class Tag(PageElement):
s = ''.join(s)
return s
- def decompose(self):
- """Recursively destroys the contents of this tree."""
- contents = [i for i in self.contents]
- for i in contents:
- if isinstance(i, Tag):
- i.decompose()
- else:
- i.extract()
- self.extract()
-
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
return self.encode(encoding, True)
@@ -684,12 +749,13 @@ class Tag(PageElement):
#Generator methods
@property
def children(self):
+ # return iter() to make the purpose of the method clear
return iter(self.contents) # XXX This seems to be untested.
@property
def recursive_children(self):
if not len(self.contents):
- raise StopIteration # XXX return instead?
+ return
stopNode = self._last_recursive_child().next
current = self.contents[0]
while current is not stopNode:
@@ -712,7 +778,7 @@ class SoupStrainer(object):
def __init__(self, name=None, attrs={}, text=None, **kwargs):
self.name = name
if isinstance(attrs, basestring):
- kwargs['class'] = attrs
+ kwargs['class'] = _match_css_class(attrs)
attrs = None
if kwargs:
if attrs:
@@ -795,8 +861,8 @@ class SoupStrainer(object):
def _matches(self, markup, matchAgainst):
#print "Matching %s against %s" % (markup, matchAgainst)
result = False
- if matchAgainst == True and type(matchAgainst) == types.BooleanType:
- result = markup != None
+ if matchAgainst is True:
+ result = markup is not None
elif callable(matchAgainst):
result = matchAgainst(markup)
else:
diff --git a/tests/test_tree.py b/tests/test_tree.py
index f2989fe..646c677 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -179,10 +179,13 @@ class TestFindAllByAttribute(TreeTest):
tree = self.soup("""
<a class="1">Class 1.</a>
<a class="2">Class 2.</a>
- <b class="1">Class 1.</a>
+ <b class="1">Class 1.</b>
+ <c class="3 4">Class 3 and 4.</c>
""")
self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
+ self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
+ self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])
def test_find_all_by_attribute_soupstrainer(self):
tree = self.soup("""
@@ -242,6 +245,24 @@ class TestFindAllByAttribute(TreeTest):
["One a.", "Two as."])
+class TestIndex(TreeTest):
+ """Test Tag.index"""
+ def test_index(self):
+ tree = self.soup("""<wrap>
+ <a>Identical</a>
+ <b>Not identical</b>
+ <a>Identical</a>
+
+ <c><d>Identical with child</d></c>
+ <b>Also not identical</b>
+ <c><d>Identical with child</d></c>
+ </wrap>""")
+ wrap = tree.wrap
+ for i, element in enumerate(wrap.contents):
+ self.assertEqual(i, wrap.index(element))
+ self.assertRaises(ValueError, tree.index, 1)
+
+
class TestParentOperations(TreeTest):
"""Test navigation and searching through an element's parents."""
@@ -591,7 +612,6 @@ class TestTreeModification(SoupTest):
self.assertEqual(new_text.nextSibling, None)
self.assertEqual(new_text.next, soup.c)
-
def test_insert_tag(self):
builder = self.default_builder
soup = self.soup(
@@ -682,6 +702,14 @@ class TestTreeModification(SoupTest):
self.assertEqual(g_tag.previous, to_text)
self.assertEqual(g_tag.previousSibling, to_text)
+ def test_replace_with_children(self):
+ tree = self.soup("""
+ <p>Unneeded <em>formatting</em> is unneeded</p>
+ """)
+ tree.em.replace_with_children()
+ self.assertEqual(tree.em, None)
+ self.assertEqual(tree.p.text, "Unneeded formatting is unneeded")
+
def test_extract(self):
soup = self.soup(
'<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>')
@@ -707,6 +735,28 @@ class TestTreeModification(SoupTest):
self.assertEquals(content_2.previous, content_1)
self.assertEquals(content_2.previousSibling, content_1)
+ def test_clear(self):
+ """Tag.clear()"""
+ soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")
+ # clear using extract()
+ a = soup.a
+ soup.p.clear()
+ self.assertEqual(len(soup.p.contents), 0)
+ self.assertTrue(hasattr(a, "contents"))
+
+ # clear using decompose()
+ em = a.em
+ a.clear(decompose=True)
+ self.assertFalse(hasattr(em, "contents"))
+
+ def test_string_set(self):
+ """Tag.string = 'string'"""
+ soup = self.soup("<a></a> <b><c></c></b>")
+ soup.a.string = "foo"
+ self.assertEqual(soup.a.contents, ["foo"])
+ soup.b.string = "bar"
+ self.assertEqual(soup.b.contents, ["bar"])
+
class TestElementObjects(SoupTest):
"""Test various features of element objects."""
@@ -781,7 +831,6 @@ class TestElementObjects(SoupTest):
self.assertEqual(soup.a.string, "foo")
self.assertEqual(soup.string, "foo")
-
def test_lack_of_string(self):
"""Only a tag containing a single text node has a .string."""
soup = self.soup("<b>f<i>e</i>o</b>")
@@ -790,6 +839,14 @@ class TestElementObjects(SoupTest):
soup = self.soup("<b></b>")
self.assertFalse(soup.b.string)
+ def test_all_text(self):
+ """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated"""
+ soup = self.soup("<a>a<b>r</b> <r> t </r></a>")
+ self.assertEqual(soup.a.text, "ar t ")
+ self.assertEqual(soup.a.get_text(strip=True), "art")
+ self.assertEqual(soup.a.get_text(","), "a,r, , t ")
+ self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
+
class TestPersistence(SoupTest):
"Testing features like pickle and deepcopy."