summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- src/beautifulsoup/__init__.py | 14
-rw-r--r-- src/beautifulsoup/tests/test_tree.py | 189
2 files changed, 194 insertions, 9 deletions
diff --git a/src/beautifulsoup/__init__.py b/src/beautifulsoup/__init__.py
index e4a8ca4..79bb657 100644
--- a/src/beautifulsoup/__init__.py
+++ b/src/beautifulsoup/__init__.py
@@ -129,7 +129,8 @@ class BeautifulStoneSoup(Tag):
# alone.
STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
- def _defaultBuilder(self):
+ @classmethod
+ def default_builder(self):
from lxml import etree
from builder.lxml_builder import LXMLTreeBuilder
return LXMLTreeBuilder(parser_class=etree.XMLParser)
@@ -141,7 +142,7 @@ class BeautifulStoneSoup(Tag):
is fed into the underlying parser."""
if builder is None:
- builder = self._defaultBuilder()
+ builder = self.default_builder()
self.builder = builder
self.builder.soup = self
@@ -343,7 +344,9 @@ class BeautifulStoneSoup(Tag):
class BeautifulSoup(BeautifulStoneSoup):
"""A convenience class for parsing HTML without creating a builder."""
- def _defaultBuilder(self):
+
+ @classmethod
+ def default_builder(self):
try:
from builder.html5_builder import HTML5TreeBuilder
return HTML5TreeBuilder()
@@ -356,11 +359,6 @@ class StopParsing(Exception):
pass
-class ICantBelieveItsBeautifulSoup(BeautifulStoneSoup):
- def _defaultBuilder(self):
- return ICantBelieveItsValidHTMLBuilder()
-
-
#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
import sys
diff --git a/src/beautifulsoup/tests/test_tree.py b/src/beautifulsoup/tests/test_tree.py
index 31d5dc2..42430d3 100644
--- a/src/beautifulsoup/tests/test_tree.py
+++ b/src/beautifulsoup/tests/test_tree.py
@@ -10,7 +10,8 @@ methods tested here.
"""
import re
-from beautifulsoup.element import SoupStrainer
+from beautifulsoup import BeautifulSoup
+from beautifulsoup.element import SoupStrainer, Tag
from helpers import SoupTest
class TreeTest(SoupTest):
@@ -508,6 +509,192 @@ class TestPreviousSibling(SiblingTest):
self.assertEquals(start.findPreviousSibling(text="nonesuch"), None)
+class TestTreeModification(SoupTest):
+
+ def test_attribute_modification(self):
+ soup = self.soup('<a id="1"></a>')
+ soup.a['id'] = 2
+ self.assertEqual(soup.decode(), self.document_for('<a id="2"></a>'))
+ del(soup.a['id'])
+ self.assertEqual(soup.decode(), self.document_for('<a></a>'))
+ soup.a['id2'] = 'foo'
+ self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>'))
+
+ def test_new_tag_creation(self):
+ builder = BeautifulSoup.default_builder()
+ soup = BeautifulSoup("", builder=builder)
+ a = Tag(soup, builder, 'a')
+ ol = Tag(soup, builder, 'ol')
+ a['href'] = 'http://foo.com/'
+ soup.insert(0, a)
+ soup.insert(1, ol)
+ self.assertEqual(
+ soup.decode(), '<a href="http://foo.com/"></a><ol></ol>')
+
+ def test_append_to_contents_moves_tag(self):
+ doc = """<p id="1">Don't leave me <b>here</b>.</p>
+ <p id="2">Don\'t leave!</p>"""
+ soup = self.soup(doc)
+ second_para = soup.find(id='2')
+ bold = soup.b
+
+ # Move the <b> tag to the end of the second paragraph.
+ soup.find(id='2').append(soup.b)
+
+ # The <b> tag is now a child of the second paragraph.
+ self.assertEqual(bold.parent, second_para)
+
+ self.assertEqual(
+ soup.decode(), self.document_for(
+ '<p id="1">Don\'t leave me .</p>\n'
+ '<p id="2">Don\'t leave!<b>here</b></p>'))
+
+ def test_replace_tag_with_itself(self):
+ text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
+ soup = BeautifulSoup(text)
+ c = soup.c
+ soup.c.replaceWith(c)
+ self.assertEquals(soup.decode(), self.document_for(text))
+
+ def test_replace_final_node(self):
+ soup = self.soup("<b>Argh!</b>")
+ soup.find(text="Argh!").replaceWith("Hooray!")
+ new_text = soup.find(text="Hooray!")
+ b = soup.b
+ self.assertEqual(new_text.previous, b)
+ self.assertEqual(new_text.parent, b)
+ self.assertEqual(new_text.previous.next, new_text)
+ self.assertEqual(new_text.next, None)
+
+ def test_consecutive_text_nodes(self):
+ # A builder should never create two consecutive text nodes,
+ # but if you insert one next to another, Beautiful Soup will
+ # handle it correctly.
+ soup = self.soup("<a><b>Argh!</b><c></c></a>")
+ soup.b.insert(1, "Hooray!")
+
+ self.assertEqual(
+ soup.decode(), self.document_for(
+ "<a><b>Argh!Hooray!</b><c></c></a>"))
+
+ new_text = soup.find(text="Hooray!")
+ self.assertEqual(new_text.previous, "Argh!")
+ self.assertEqual(new_text.previous.next, new_text)
+
+ self.assertEqual(new_text.previousSibling, "Argh!")
+ self.assertEqual(new_text.previousSibling.nextSibling, new_text)
+
+ self.assertEqual(new_text.nextSibling, None)
+ self.assertEqual(new_text.next, soup.c)
+
+
+ def test_insert_tag(self):
+ builder = self.default_builder
+ soup = BeautifulSoup(
+ "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
+ magic_tag = Tag(soup, builder, 'magictag')
+ magic_tag.insert(0, "the")
+ soup.a.insert(1, magic_tag)
+
+ self.assertEqual(
+ soup.decode(), self.document_for(
+ "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>"))
+
+ # Make sure all the relationships are hooked up correctly.
+ b_tag = soup.b
+ self.assertEqual(b_tag.nextSibling, magic_tag)
+ self.assertEqual(magic_tag.previousSibling, b_tag)
+
+ find = b_tag.find(text="Find")
+ self.assertEqual(find.next, magic_tag)
+ self.assertEqual(magic_tag.previous, find)
+
+ c_tag = soup.c
+ self.assertEqual(magic_tag.nextSibling, c_tag)
+ self.assertEqual(c_tag.previousSibling, magic_tag)
+
+ the = magic_tag.find(text="the")
+ self.assertEqual(the.parent, magic_tag)
+ self.assertEqual(the.next, c_tag)
+ self.assertEqual(c_tag.previous, the)
+
+ def test_replace_with(self):
+ soup = self.soup(
+ "<p>There's <b>no</b> business like <b>show</b> business</p>")
+ no, show = soup.findAll('b')
+ show.replaceWith(no)
+ self.assertEquals(
+ soup.decode(),
+ self.document_for(
+ "<p>There's business like <b>no</b> business</p>"))
+
+ self.assertEquals(show.parent, None)
+ self.assertEquals(no.parent, soup.p)
+ self.assertEquals(no.next, "no")
+ self.assertEquals(no.nextSibling, " business")
+
+ def test_nested_tag_replace_with(self):
+ soup = BeautifulSoup(
+ """<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")
+
+ # Replace the entire <b> tag and its contents ("reserve the
+ # right") with the <f> tag ("refuse").
+ remove_tag = soup.b
+ move_tag = soup.f
+ remove_tag.replaceWith(move_tag)
+
+ self.assertEqual(
+ soup.decode(), self.document_for(
+ "<a>We<f>refuse</f></a><e>to<g>service</g></e>"))
+
+ # The <b> tag is now an orphan.
+ self.assertEqual(remove_tag.parent, None)
+ self.assertEqual(remove_tag.find(text="right").next, None)
+ self.assertEqual(remove_tag.previous, None)
+ self.assertEqual(remove_tag.nextSibling, None)
+ self.assertEqual(remove_tag.previousSibling, None)
+
+ # The <f> tag is now connected to the <a> tag.
+ self.assertEqual(move_tag.parent, soup.a)
+ self.assertEqual(move_tag.previous, "We")
+ self.assertEqual(move_tag.next.next, soup.e)
+ self.assertEqual(move_tag.nextSibling, None)
+
+ # The gap where the <f> tag used to be has been mended, and
+ # the word "to" is now connected to the <g> tag.
+ to_text = soup.find(text="to")
+ g_tag = soup.g
+ self.assertEqual(to_text.next, g_tag)
+ self.assertEqual(to_text.nextSibling, g_tag)
+ self.assertEqual(g_tag.previous, to_text)
+ self.assertEqual(g_tag.previousSibling, to_text)
+
+ def test_extract(self):
+ soup = self.soup(
+ '<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>')
+
+ self.assertEqual(len(soup.body.contents), 3)
+ extracted = soup.find(id="nav").extract()
+
+ self.assertEqual(
+ soup.decode(), "<html><body>Some content. More content.</body></html>")
+ self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')
+
+ # The extracted tag is now an orphan.
+ self.assertEqual(len(soup.body.contents), 2)
+ self.assertEqual(extracted.parent, None)
+ self.assertEqual(extracted.previous, None)
+ self.assertEqual(extracted.next.next, None)
+
+ # The gap where the extracted tag used to be has been mended.
+ content_1 = soup.find(text="Some content. ")
+ content_2 = soup.find(text=" More content.")
+ self.assertEquals(content_1.next, content_2)
+ self.assertEquals(content_1.nextSibling, content_2)
+ self.assertEquals(content_2.previous, content_1)
+ self.assertEquals(content_2.previousSibling, content_1)
+
+
class TestElementObjects(SoupTest):
"""Test various features of element objects."""