summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--bs4/__init__.py4
-rw-r--r--bs4/element.py49
-rw-r--r--bs4/tests/test_tree.py11
3 files changed, 44 insertions, 20 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 07795b9..fbe2914 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -193,6 +193,10 @@ class BeautifulSoup(Tag):
self.tagStack = []
self.pushTag(self)
+ def new_tag(self, name, **attrs):
+ """Create a new tag associated with this soup."""
+ return Tag(None, None, name, attrs)
+
def popTag(self):
tag = self.tagStack.pop()
#print "Pop", tag.name
diff --git a/bs4/element.py b/bs4/element.py
index 0ba2bdc..9344f45 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -76,7 +76,7 @@ class PageElement(object):
#Find the two elements that would be next to each other if
#this element (and any children) hadn't been parsed. Connect
#the two.
- last_child = self._last_recursive_child()
+ last_child = self._last_descendant()
next_element = last_child.next_element
if self.previous_element:
@@ -94,14 +94,14 @@ class PageElement(object):
self.previous_sibling = self.next_sibling = None
return self
- def _last_recursive_child(self):
+ def _last_descendant(self):
"Finds the last element beneath this object to be parsed."
last_child = self
while hasattr(last_child, 'contents') and last_child.contents:
last_child = last_child.contents[-1]
return last_child
# BS3: Not part of the API!
- _lastRecursiveChild = _last_recursive_child
+ _lastRecursiveChild = _last_descendant
def insert(self, position, new_child):
if (isinstance(new_child, basestring)
@@ -130,11 +130,11 @@ class PageElement(object):
previous_child = self.contents[position - 1]
new_child.previous_sibling = previous_child
new_child.previous_sibling.next_sibling = new_child
- new_child.previous_element = previous_child._last_recursive_child()
+ new_child.previous_element = previous_child._last_descendant()
if new_child.previous:
new_child.previous_element.next_element = new_child
- new_childs_last_element = new_child._last_recursive_child()
+ new_childs_last_element = new_child._last_descendant()
if position >= len(self.contents):
new_child.next_sibling = None
@@ -504,20 +504,29 @@ class Tag(PageElement):
self.clear()
self.append(string)
- def get_text(self, separator=u"", strip=False):
+ def _all_strings(self, strip=False):
+ """Yield all child strings, possibly stripping them."""
+ for descendant in self.descendants:
+ if not isinstance(descendant, NavigableString):
+ continue
+ if strip:
+ descendant = descendant.strip()
+ if len(descendant) == 0:
+ continue
+ yield descendant
+ strings = property(_all_strings)
+
+ @property
+ def stripped_strings(self):
+ for string in self._all_strings(True):
+ yield string
+
+ def get_text(self, separator="", strip=False):
"""
- Get all child strings, concatenated using the given separator
+ Get all child strings, concatenated using the given separator.
"""
- if strip:
- return separator.join(string.strip()
- for string in self.recursive_children
- if isinstance(string, NavigableString) and string.strip())
- else:
- return separator.join(string
- for string in self.recursive_children
- if isinstance(string, NavigableString))
+ return separator.join([s for s in self._all_strings(strip)])
getText = get_text
-
text = property(get_text)
def decompose(self):
@@ -774,7 +783,7 @@ class Tag(PageElement):
callable that takes a string and returns whether or not the
string matches for some custom definition of 'matches'. The
same is true of the tag name."""
- generator = self.recursive_children
+ generator = self.descendants
if not recursive:
generator = self.children
return self._find_all(name, attrs, text, limit, generator, **kwargs)
@@ -788,10 +797,10 @@ class Tag(PageElement):
return iter(self.contents) # XXX This seems to be untested.
@property
- def recursive_children(self):
+ def descendants(self):
if not len(self.contents):
return
- stopNode = self._last_recursive_child().next_element
+ stopNode = self._last_descendant().next_element
current = self.contents[0]
while current is not stopNode:
yield current
@@ -802,7 +811,7 @@ class Tag(PageElement):
return self.children
def recursiveChildGenerator(self):
- return self.recursive_children
+ return self.descendants
# This was kind of misleading because has_key() (attributes) was
# different from __in__ (contents). has_key() is gone in Python 3,
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index f8a55e0..ee52edf 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -534,6 +534,17 @@ class TestPreviousSibling(SiblingTest):
self.assertEqual(start.find_previous_sibling(text="nonesuch"), None)
+class TestTagCreation(SoupTest):
+ """Test the ability to create new tags."""
+ def test_new_tag(self):
+ soup = self.soup("")
+ new_tag = soup.new_tag("foo", bar="baz")
+ self.assertTrue(isinstance(new_tag, Tag))
+ self.assertEqual("foo", new_tag)
+ self.assertEqual(dict(bar="baz"), new_tag.attrs)
+ self.assertEqual(None, new_tag.parent)
+
+
class TestTreeModification(SoupTest):
def test_attribute_modification(self):