diff options
| field | value | date |
|---|---|---|
| author | Aaron DeVore <aaron.devore@gmail.com> | 2011-03-05 00:47:57 -0800 |
| committer | Aaron DeVore <aaron.devore@gmail.com> | 2011-03-05 00:47:57 -0800 |
| commit | 60fe2eebc961d4dfe41db60add7e8d1b8d1f53db (patch) | |
| tree | 0f448ab70563960bea7504822b9c3d5c5e04a82a | |
| parent | b01f9312a13198d249060dac34ab12629285cdb2 (diff) | |
Add 3.0.7a -> 3.0.8 changes, plus some tweaks
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | bs4/element.py | 136 |
| -rw-r--r-- | tests/test_tree.py | 63 |

2 files changed, 161 insertions, 38 deletions
diff --git a/bs4/element.py b/bs4/element.py index 6fb6210..ffe13c5 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -11,6 +11,11 @@ from util import isList DEFAULT_OUTPUT_ENCODING = "utf-8" +def _match_css_class(str): + """Build a RE to match the given CSS class.""" + return re.compile(r"(^|.*\s)%s($|\s)" % str) + + class PageElement(object): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" @@ -29,10 +34,10 @@ class PageElement(object): def replace_with(self, replace_with): oldParent = self.parent - myIndex = self.parent.contents.index(self) - if hasattr(replace_with, 'parent') and replace_with.parent == self.parent: + myIndex = self.parent.index(self) + if hasattr(replace_with, 'parent') and replace_with.parent is self.parent: # We're replacing this element with one of its siblings. - index = self.parent.contents.index(replace_with) + index = self.parent.index(replace_with) if index and index < myIndex: # Furthermore, it comes before this element. That # means that when we extract it, the index of this @@ -40,15 +45,20 @@ class PageElement(object): myIndex = myIndex - 1 self.extract() oldParent.insert(myIndex, replace_with) - replaceWith = replace_with # BS4 + replaceWith = replace_with # BS3 + + def replace_with_children(self): + my_parent = self.parent + my_index = self.parent.index(self) + self.extract() + for child in reversed(self.contents[:]): + my_parent.insert(my_index, child) + replaceWithChildren = replace_with_children def extract(self): """Destructively rips this element out of the tree.""" if self.parent: - try: - self.parent.contents.remove(self) - except ValueError: - pass + del self.parent.contents[self.parent.index(self)] #Find the two elements that would be next to each other if #this element (and any children) hadn't been parsed. 
Connect @@ -80,22 +90,20 @@ class PageElement(object): def insert(self, position, newChild): if (isinstance(newChild, basestring) - or isinstance(newChild, unicode)) \ - and not isinstance(newChild, NavigableString): + and not isinstance(newChild, NavigableString)): newChild = NavigableString(newChild) position = min(position, len(self.contents)) - if hasattr(newChild, 'parent') and newChild.parent != None: + if hasattr(newChild, 'parent') and newChild.parent is not None: # We're 'inserting' an element that's already one # of this object's children. - if newChild.parent == self: - index = self.find(newChild) - if index and index < position: + if newChild.parent is self: + if self.index(newChild) > position: # Furthermore we're moving it further down the # list of this object's children. That means that # when we extract this element, our target index # will jump down one. - position = position - 1 + position -= 1 newChild.extract() newChild.parent = self @@ -239,6 +247,17 @@ class PageElement(object): if isinstance(name, SoupStrainer): strainer = name + elif text is None and not limit and not attrs and not kwargs: + # findAll*(True) + if name is True or name is None: + return [element for element in generator + if isinstance(element, Tag)] + # findAll*('tag-name') + elif isinstance(name, basestring): + return [element for element in generator + if isinstance(element, Tag) and element.name == name] + else: + strainer = SoupStrainer(name, attrs, text, **kwargs) else: # Build a SoupStrainer strainer = SoupStrainer(name, attrs, text, **kwargs) @@ -261,35 +280,35 @@ class PageElement(object): @property def next_elements(self): i = self - while i: + while i is not None: i = i.next yield i @property def next_siblings(self): i = self - while i: + while i is not None: i = i.nextSibling yield i @property def previous_elements(self): i = self - while i: + while i is not None: i = i.previous yield i @property def previous_siblings(self): i = self - while i: + while i is not 
None: i = i.previousSibling yield i @property def parents(self): i = self - while i: + while i is not None: i = i.parent yield i @@ -404,7 +423,7 @@ class Tag(PageElement): # chunks be garbage-collected. self.parserClass = parser.__class__ self.name = name - if attrs == None: + if attrs is None: attrs = {} else: attrs = dict(attrs) @@ -454,6 +473,60 @@ class Tag(PageElement): return child return child.string + @string.setter + def string(self, string): + self.clear() + self.append(string) + + def get_text(self, separator=u"", strip=False): + """ + Get all child strings, concatenated using the given separator + """ + if strip: + return separator.join(string.strip() + for string in self.recursive_children + if isinstance(string, NavigableString) and string.strip()) + else: + return separator.join(string + for string in self.recursive_children + if isinstance(string, NavigableString)) + getText = get_text + + text = property(get_text) + + def decompose(self): + """Recursively destroys the contents of this tree.""" + self.extract() + i = self + while i is not None: + next = i.next + i.__dict__.clear() + i = next + + def clear(self, decompose=False): + """ + Extract all children. If decompose is True, decompose instead. + """ + if decompose: + for element in self.contents[:]: + if isinstance(element, Tag): + element.decompose() + else: + element.extract() + else: + for element in self.contents[:]: + element.extract() + + def index(self, element): + """ + Find the index of a child by identity, not value. Avoids issues with + tag.contents.index(element) getting the index of equal elements. 
+ """ + for i, child in enumerate(self.contents): + if child is element: + return i + raise ValueError("Tag.index: element not in tag") + def get(self, key, default=None): """Returns the value of the 'key' attribute for the tag, or the value given for 'default' if it doesn't have that @@ -510,6 +583,8 @@ class Tag(PageElement): def __eq__(self, other): """Returns true iff this tag has the same name, the same attributes, and the same contents (recursively) as the given tag.""" + if self is other: + return True if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): return False for i in range(0, len(self.contents)): @@ -606,16 +681,6 @@ class Tag(PageElement): s = ''.join(s) return s - def decompose(self): - """Recursively destroys the contents of this tree.""" - contents = [i for i in self.contents] - for i in contents: - if isinstance(i, Tag): - i.decompose() - else: - i.extract() - self.extract() - def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): return self.encode(encoding, True) @@ -684,12 +749,13 @@ class Tag(PageElement): #Generator methods @property def children(self): + # return iter() to make the purpose of the method clear return iter(self.contents) # XXX This seems to be untested. @property def recursive_children(self): if not len(self.contents): - raise StopIteration # XXX return instead? 
+ return stopNode = self._last_recursive_child().next current = self.contents[0] while current is not stopNode: @@ -712,7 +778,7 @@ class SoupStrainer(object): def __init__(self, name=None, attrs={}, text=None, **kwargs): self.name = name if isinstance(attrs, basestring): - kwargs['class'] = attrs + kwargs['class'] = _match_css_class(attrs) attrs = None if kwargs: if attrs: @@ -795,8 +861,8 @@ class SoupStrainer(object): def _matches(self, markup, matchAgainst): #print "Matching %s against %s" % (markup, matchAgainst) result = False - if matchAgainst == True and type(matchAgainst) == types.BooleanType: - result = markup != None + if matchAgainst is True: + result = markup is not None elif callable(matchAgainst): result = matchAgainst(markup) else: diff --git a/tests/test_tree.py b/tests/test_tree.py index f2989fe..646c677 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -179,10 +179,13 @@ class TestFindAllByAttribute(TreeTest): tree = self.soup(""" <a class="1">Class 1.</a> <a class="2">Class 2.</a> - <b class="1">Class 1.</a> + <b class="1">Class 1.</b> + <c class="3 4">Class 3 and 4.</c> """) self.assertSelects(tree.find_all('a', '1'), ['Class 1.']) self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.']) + self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.']) + self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.']) def test_find_all_by_attribute_soupstrainer(self): tree = self.soup(""" @@ -242,6 +245,24 @@ class TestFindAllByAttribute(TreeTest): ["One a.", "Two as."]) +class TestIndex(TreeTest): + """Test Tag.index""" + def test_index(self): + tree = self.soup("""<wrap> + <a>Identical</a> + <b>Not identical</b> + <a>Identical</a> + + <c><d>Identical with child</d></c> + <b>Also not identical</b> + <c><d>Identical with child</d></c> + </wrap>""") + wrap = tree.wrap + for i, element in enumerate(wrap.contents): + self.assertEqual(i, wrap.index(element)) + self.assertRaises(ValueError, tree.index, 1) + + class 
TestParentOperations(TreeTest): """Test navigation and searching through an element's parents.""" @@ -591,7 +612,6 @@ class TestTreeModification(SoupTest): self.assertEqual(new_text.nextSibling, None) self.assertEqual(new_text.next, soup.c) - def test_insert_tag(self): builder = self.default_builder soup = self.soup( @@ -682,6 +702,14 @@ class TestTreeModification(SoupTest): self.assertEqual(g_tag.previous, to_text) self.assertEqual(g_tag.previousSibling, to_text) + def test_replace_with_children(self): + tree = self.soup(""" + <p>Unneeded <em>formatting</em> is unneeded</p> + """) + tree.em.replace_with_children() + self.assertEqual(tree.em, None) + self.assertEqual(tree.p.text, "Unneeded formatting is unneeded") + def test_extract(self): soup = self.soup( '<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>') @@ -707,6 +735,28 @@ class TestTreeModification(SoupTest): self.assertEquals(content_2.previous, content_1) self.assertEquals(content_2.previousSibling, content_1) + def test_clear(self): + """Tag.clear()""" + soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>") + # clear using extract() + a = soup.a + soup.p.clear() + self.assertEqual(len(soup.p.contents), 0) + self.assertTrue(hasattr(a, "contents")) + + # clear using decompose() + em = a.em + a.clear(decompose=True) + self.assertFalse(hasattr(em, "contents")) + + def test_string_set(self): + """Tag.string = 'string'""" + soup = self.soup("<a></a> <b><c></c></b>") + soup.a.string = "foo" + self.assertEqual(soup.a.contents, ["foo"]) + soup.b.string = "bar" + self.assertEqual(soup.b.contents, ["bar"]) + class TestElementObjects(SoupTest): """Test various features of element objects.""" @@ -781,7 +831,6 @@ class TestElementObjects(SoupTest): self.assertEqual(soup.a.string, "foo") self.assertEqual(soup.string, "foo") - def test_lack_of_string(self): """Only a tag containing a single text node has a .string.""" soup = self.soup("<b>f<i>e</i>o</b>") @@ -790,6 
+839,14 @@ class TestElementObjects(SoupTest): soup = self.soup("<b></b>") self.assertFalse(soup.b.string) + def test_all_text(self): + """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated""" + soup = self.soup("<a>a<b>r</b> <r> t </r></a>") + self.assertEqual(soup.a.text, "ar t ") + self.assertEqual(soup.a.get_text(strip=True), "art") + self.assertEqual(soup.a.get_text(","), "a,r, , t ") + self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t") + class TestPersistence(SoupTest): "Testing features like pickle and deepcopy." |