diff options
-rw-r--r-- | CHANGELOG | 2 | ||||
-rw-r--r-- | bs4/__init__.py | 16 | ||||
-rw-r--r-- | bs4/element.py | 64 | ||||
-rw-r--r-- | tests/test_tree.py | 62 |
4 files changed, 76 insertions, 68 deletions
@@ -39,6 +39,8 @@ Some attributes have also been renamed: * Tag.isSelfClosing -> Tag.is_empty_element * UnicodeDammit.unicode -> UnicodeDammit.unicode_markup + * Tag.next -> Tag.next_element + * Tag.previous -> Tag.previous_element So have some arguments to popular methods: diff --git a/bs4/__init__.py b/bs4/__init__.py index 22ecc43..6406bef 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -159,10 +159,10 @@ class BeautifulSoup(Tag): def object_was_parsed(self, o): """Add an object to the parse tree.""" - o.setup(self.currentTag, self.previous) - if self.previous: - self.previous.next = o - self.previous = o + o.setup(self.currentTag, self.previous_element) + if self.previous_element: + self.previous_element.next_element = o + self.previous_element = o self.currentTag.contents.append(o) @@ -206,12 +206,12 @@ class BeautifulSoup(Tag): return None tag = Tag(self, self.builder, name, attrs, self.currentTag, - self.previous) + self.previous_element) if tag is None: return tag - if self.previous: - self.previous.next = tag - self.previous = tag + if self.previous_element: + self.previous_element.next_element = tag + self.previous_element = tag self.pushTag(tag) return tag diff --git a/bs4/element.py b/bs4/element.py index 587078c..729ab36 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1,9 +1,6 @@ +import collections import re import types -try: - from htmlentitydefs import name2codepoint -except ImportError: - name2codepoint = {} from bs4.dammit import EntitySubstitution DEFAULT_OUTPUT_ENCODING = "utf-8" @@ -13,12 +10,12 @@ class PageElement(object): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" - def setup(self, parent=None, previous=None): + def setup(self, parent=None, previous_element=None): """Sets up the initial relations between this element and other elements.""" self.parent = parent - self.previous = previous - self.next = None + self.previous_element = previous_element + self.next_element = None self.previousSibling = None self.nextSibling = None if self.parent and self.parent.contents: @@ -52,14 +49,14 @@ class PageElement(object): #this element (and any children) hadn't been parsed. Connect #the two. lastChild = self._last_recursive_child() - nextElement = lastChild.next + nextElement = lastChild.next_element - if self.previous: - self.previous.next = nextElement + if self.previous_element: + self.previous_element.next_element = nextElement if nextElement: - nextElement.previous = self.previous - self.previous = None - lastChild.next = None + nextElement.previous_element = self.previous_element + self.previous_element = None + lastChild.next_element = None self.parent = None if self.previousSibling: @@ -100,14 +97,14 @@ class PageElement(object): previousChild = None if position == 0: newChild.previousSibling = None - newChild.previous = self + newChild.previous_element = self else: previousChild = self.contents[position-1] newChild.previousSibling = previousChild newChild.previousSibling.nextSibling = newChild - newChild.previous = previousChild._last_recursive_child() - if newChild.previous: - newChild.previous.next = newChild + newChild.previous_element = previousChild._last_recursive_child() + if newChild.previous_element: + newChild.previous_element.next_element = newChild newChildsLastElement = newChild._last_recursive_child() @@ -122,18 +119,18 @@ class PageElement(object): if not parent: # This is the last element in the document. break if parentsNextSibling: - newChildsLastElement.next = parentsNextSibling + newChildsLastElement.next_element = parentsNextSibling else: - newChildsLastElement.next = None + newChildsLastElement.next_element = None else: nextChild = self.contents[position] newChild.nextSibling = nextChild if newChild.nextSibling: newChild.nextSibling.previousSibling = newChild - newChildsLastElement.next = nextChild + newChildsLastElement.next_element = nextChild - if newChildsLastElement.next: - newChildsLastElement.next.previous = newChildsLastElement + if newChildsLastElement.next_element: + newChildsLastElement.next_element.previous_element = newChildsLastElement self.contents.insert(position, newChild) def append(self, tag): @@ -223,6 +220,14 @@ class PageElement(object): findParents = find_parents # BS3 fetchParents = find_parents # BS2 + @property + def next(self): + return self.next_element + + @property + def previous(self): + return self.previous_element + #These methods do the real heavy lifting. def _find_one(self, method, name, attrs, text, **kwargs): @@ -260,7 +265,7 @@ class PageElement(object): def next_elements(self): i = self while i: - i = i.next + i = i.next_element yield i @property @@ -274,7 +279,7 @@ class PageElement(object): def previous_elements(self): i = self while i: - i = i.previous + i = i.previous_element yield i @property @@ -688,11 +693,11 @@ class Tag(PageElement): def recursive_children(self): if not len(self.contents): raise StopIteration # XXX return instead? - stopNode = self._last_recursive_child().next + stopNode = self._last_recursive_child().next_element current = self.contents[0] while current is not stopNode: yield current - current = current.next + current = current.next_element # Old names for backwards compatibility def childGenerator(self): @@ -733,8 +738,9 @@ class SoupStrainer(object): if isinstance(markupName, Tag): markup = markupName markupAttrs = markup - callFunctionWithTagData = callable(self.name) \ - and not isinstance(markupName, Tag) + callFunctionWithTagData = ( + isinstance(self.name, collections.Callable) + and not isinstance(markupName, Tag)) if (not self.name) \ or callFunctionWithTagData \ @@ -795,7 +801,7 @@ class SoupStrainer(object): result = False if matchAgainst == True and type(matchAgainst) == types.BooleanType: result = markup != None - elif callable(matchAgainst): + elif isinstance(matchAgainst, collections.Callable): result = matchAgainst(markup) else: #Custom match methods take the tag as an argument, but all diff --git a/tests/test_tree.py b/tests/test_tree.py index f2989fe..87a7e3a 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -307,16 +307,16 @@ class TestNextOperations(ProximityTest): self.start = self.tree.b def test_next(self): - self.assertEquals(self.start.next, "One") - self.assertEquals(self.start.next.next['id'], "2") + self.assertEquals(self.start.next_element, "One") + self.assertEquals(self.start.next_element.next_element['id'], "2") def test_next_of_last_item_is_none(self): last = self.tree.find(text="Three") - self.assertEquals(last.next, None) + self.assertEquals(last.next_element, None) def test_next_of_root_is_none(self): # The document root is outside the next/previous chain. - self.assertEquals(self.tree.next, None) + self.assertEquals(self.tree.next_element, None) def test_find_all_next(self): self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"]) @@ -352,17 +352,17 @@ class TestPreviousOperations(ProximityTest): self.end = self.tree.find(text="Three") def test_previous(self): - self.assertEquals(self.end.previous['id'], "3") - self.assertEquals(self.end.previous.previous, "Two") + self.assertEquals(self.end.previous_element['id'], "3") + self.assertEquals(self.end.previous_element.previous_element, "Two") def test_previous_of_first_item_is_none(self): first = self.tree.find('html') - self.assertEquals(first.previous, None) + self.assertEquals(first.previous_element, None) def test_previous_of_root_is_none(self): # The document root is outside the next/previous chain. # XXX This is broken! - #self.assertEquals(self.tree.previous, None) + #self.assertEquals(self.tree.previous_element, None) pass def test_find_all_previous(self): @@ -436,7 +436,7 @@ class TestNextSibling(SiblingTest): self.assertEquals(self.start.nextSibling.nextSibling['id'], '3') # Note the difference between nextSibling and next. - self.assertEquals(self.start.next['id'], '1.1') + self.assertEquals(self.start.next_element['id'], '1.1') def test_next_sibling_may_not_exist(self): self.assertEquals(self.tree.html.nextSibling, None) @@ -481,7 +481,7 @@ class TestPreviousSibling(SiblingTest): self.assertEquals(self.end.previousSibling.previousSibling['id'], '2') # Note the difference between previousSibling and previous. - self.assertEquals(self.end.previous['id'], '3.1') + self.assertEquals(self.end.previous_element['id'], '3.1') def test_previous_sibling_may_not_exist(self): self.assertEquals(self.tree.html.previousSibling, None) @@ -565,10 +565,10 @@ class TestTreeModification(SoupTest): soup.find(text="Argh!").replace_with("Hooray!") new_text = soup.find(text="Hooray!") b = soup.b - self.assertEqual(new_text.previous, b) + self.assertEqual(new_text.previous_element, b) self.assertEqual(new_text.parent, b) - self.assertEqual(new_text.previous.next, new_text) - self.assertEqual(new_text.next, None) + self.assertEqual(new_text.previous_element.next_element, new_text) + self.assertEqual(new_text.next_element, None) def test_consecutive_text_nodes(self): # A builder should never create two consecutive text nodes, @@ -582,14 +582,14 @@ class TestTreeModification(SoupTest): "<a><b>Argh!Hooray!</b><c></c></a>")) new_text = soup.find(text="Hooray!") - self.assertEqual(new_text.previous, "Argh!") - self.assertEqual(new_text.previous.next, new_text) + self.assertEqual(new_text.previous_element, "Argh!") + self.assertEqual(new_text.previous_element.next_element, new_text) self.assertEqual(new_text.previousSibling, "Argh!") self.assertEqual(new_text.previousSibling.nextSibling, new_text) self.assertEqual(new_text.nextSibling, None) - self.assertEqual(new_text.next, soup.c) + self.assertEqual(new_text.next_element, soup.c) def test_insert_tag(self): @@ -610,8 +610,8 @@ class TestTreeModification(SoupTest): self.assertEqual(magic_tag.previousSibling, b_tag) find = b_tag.find(text="Find") - self.assertEqual(find.next, magic_tag) - self.assertEqual(magic_tag.previous, find) + self.assertEqual(find.next_element, magic_tag) + self.assertEqual(magic_tag.previous_element, find) c_tag = soup.c self.assertEqual(magic_tag.nextSibling, c_tag) @@ -619,8 +619,8 @@ class TestTreeModification(SoupTest): the = magic_tag.find(text="the") self.assertEqual(the.parent, magic_tag) - self.assertEqual(the.next, c_tag) - self.assertEqual(c_tag.previous, the) + self.assertEqual(the.next_element, c_tag) + self.assertEqual(c_tag.previous_element, the) def test_insert_works_on_empty_element_tag(self): # This is a little strange, since most HTML parsers don't allow @@ -643,7 +643,7 @@ class TestTreeModification(SoupTest): self.assertEquals(show.parent, None) self.assertEquals(no.parent, soup.p) - self.assertEquals(no.next, "no") + self.assertEquals(no.next_element, "no") self.assertEquals(no.nextSibling, " business") def test_nested_tag_replace_with(self): @@ -662,24 +662,24 @@ class TestTreeModification(SoupTest): # The <b> tag is now an orphan. self.assertEqual(remove_tag.parent, None) - self.assertEqual(remove_tag.find(text="right").next, None) - self.assertEqual(remove_tag.previous, None) + self.assertEqual(remove_tag.find(text="right").next_element, None) + self.assertEqual(remove_tag.previous_element, None) self.assertEqual(remove_tag.nextSibling, None) self.assertEqual(remove_tag.previousSibling, None) # The <f> tag is now connected to the <a> tag. self.assertEqual(move_tag.parent, soup.a) - self.assertEqual(move_tag.previous, "We") - self.assertEqual(move_tag.next.next, soup.e) + self.assertEqual(move_tag.previous_element, "We") + self.assertEqual(move_tag.next_element.next_element, soup.e) self.assertEqual(move_tag.nextSibling, None) # The gap where the <f> tag used to be has been mended, and # the word "to" is now connected to the <g> tag. to_text = soup.find(text="to") g_tag = soup.g - self.assertEqual(to_text.next, g_tag) + self.assertEqual(to_text.next_element, g_tag) self.assertEqual(to_text.nextSibling, g_tag) - self.assertEqual(g_tag.previous, to_text) + self.assertEqual(g_tag.previous_element, to_text) self.assertEqual(g_tag.previousSibling, to_text) def test_extract(self): @@ -696,15 +696,15 @@ class TestTreeModification(SoupTest): # The extracted tag is now an orphan. self.assertEqual(len(soup.body.contents), 2) self.assertEqual(extracted.parent, None) - self.assertEqual(extracted.previous, None) - self.assertEqual(extracted.next.next, None) + self.assertEqual(extracted.previous_element, None) + self.assertEqual(extracted.next_element.next_element, None) # The gap where the extracted tag used to be has been mended. content_1 = soup.find(text="Some content. ") content_2 = soup.find(text=" More content.") - self.assertEquals(content_1.next, content_2) + self.assertEquals(content_1.next_element, content_2) self.assertEquals(content_1.nextSibling, content_2) - self.assertEquals(content_2.previous, content_1) + self.assertEquals(content_2.previous_element, content_1) self.assertEquals(content_2.previousSibling, content_1) |