summary refs log tree commit diff
path: root/bs4/element.py
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/element.py')
-rw-r--r-- bs4/element.py 136
1 files changed, 101 insertions, 35 deletions
diff --git a/bs4/element.py b/bs4/element.py
index 6fb6210..ffe13c5 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -11,6 +11,11 @@ from util import isList
DEFAULT_OUTPUT_ENCODING = "utf-8"
+def _match_css_class(str):
+ """Build a RE to match the given CSS class."""
+ return re.compile(r"(^|.*\s)%s($|\s)" % str)
+
+
class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
@@ -29,10 +34,10 @@ class PageElement(object):
def replace_with(self, replace_with):
oldParent = self.parent
- myIndex = self.parent.contents.index(self)
- if hasattr(replace_with, 'parent') and replace_with.parent == self.parent:
+ myIndex = self.parent.index(self)
+ if hasattr(replace_with, 'parent') and replace_with.parent is self.parent:
# We're replacing this element with one of its siblings.
- index = self.parent.contents.index(replace_with)
+ index = self.parent.index(replace_with)
if index and index < myIndex:
# Furthermore, it comes before this element. That
# means that when we extract it, the index of this
@@ -40,15 +45,20 @@ class PageElement(object):
myIndex = myIndex - 1
self.extract()
oldParent.insert(myIndex, replace_with)
- replaceWith = replace_with # BS4
+ replaceWith = replace_with # BS3
+
+ def replace_with_children(self):
+ my_parent = self.parent
+ my_index = self.parent.index(self)
+ self.extract()
+ for child in reversed(self.contents[:]):
+ my_parent.insert(my_index, child)
+ replaceWithChildren = replace_with_children
def extract(self):
"""Destructively rips this element out of the tree."""
if self.parent:
- try:
- self.parent.contents.remove(self)
- except ValueError:
- pass
+ del self.parent.contents[self.parent.index(self)]
#Find the two elements that would be next to each other if
#this element (and any children) hadn't been parsed. Connect
@@ -80,22 +90,20 @@ class PageElement(object):
def insert(self, position, newChild):
if (isinstance(newChild, basestring)
- or isinstance(newChild, unicode)) \
- and not isinstance(newChild, NavigableString):
+ and not isinstance(newChild, NavigableString)):
newChild = NavigableString(newChild)
position = min(position, len(self.contents))
- if hasattr(newChild, 'parent') and newChild.parent != None:
+ if hasattr(newChild, 'parent') and newChild.parent is not None:
# We're 'inserting' an element that's already one
# of this object's children.
- if newChild.parent == self:
- index = self.find(newChild)
- if index and index < position:
+ if newChild.parent is self:
+ if self.index(newChild) > position:
# Furthermore we're moving it further down the
# list of this object's children. That means that
# when we extract this element, our target index
# will jump down one.
- position = position - 1
+ position -= 1
newChild.extract()
newChild.parent = self
@@ -239,6 +247,17 @@ class PageElement(object):
if isinstance(name, SoupStrainer):
strainer = name
+ elif text is None and not limit and not attrs and not kwargs:
+ # findAll*(True)
+ if name is True or name is None:
+ return [element for element in generator
+ if isinstance(element, Tag)]
+ # findAll*('tag-name')
+ elif isinstance(name, basestring):
+ return [element for element in generator
+ if isinstance(element, Tag) and element.name == name]
+ else:
+ strainer = SoupStrainer(name, attrs, text, **kwargs)
else:
# Build a SoupStrainer
strainer = SoupStrainer(name, attrs, text, **kwargs)
@@ -261,35 +280,35 @@ class PageElement(object):
@property
def next_elements(self):
i = self
- while i:
+ while i is not None:
i = i.next
yield i
@property
def next_siblings(self):
i = self
- while i:
+ while i is not None:
i = i.nextSibling
yield i
@property
def previous_elements(self):
i = self
- while i:
+ while i is not None:
i = i.previous
yield i
@property
def previous_siblings(self):
i = self
- while i:
+ while i is not None:
i = i.previousSibling
yield i
@property
def parents(self):
i = self
- while i:
+ while i is not None:
i = i.parent
yield i
@@ -404,7 +423,7 @@ class Tag(PageElement):
# chunks be garbage-collected.
self.parserClass = parser.__class__
self.name = name
- if attrs == None:
+ if attrs is None:
attrs = {}
else:
attrs = dict(attrs)
@@ -454,6 +473,60 @@ class Tag(PageElement):
return child
return child.string
+ @string.setter
+ def string(self, string):
+ self.clear()
+ self.append(string)
+
+ def get_text(self, separator=u"", strip=False):
+ """
+ Get all child strings, concatenated using the given separator
+ """
+ if strip:
+ return separator.join(string.strip()
+ for string in self.recursive_children
+ if isinstance(string, NavigableString) and string.strip())
+ else:
+ return separator.join(string
+ for string in self.recursive_children
+ if isinstance(string, NavigableString))
+ getText = get_text
+
+ text = property(get_text)
+
+ def decompose(self):
+ """Recursively destroys the contents of this tree."""
+ self.extract()
+ i = self
+ while i is not None:
+ next = i.next
+ i.__dict__.clear()
+ i = next
+
+ def clear(self, decompose=False):
+ """
+ Extract all children. If decompose is True, decompose instead.
+ """
+ if decompose:
+ for element in self.contents[:]:
+ if isinstance(element, Tag):
+ element.decompose()
+ else:
+ element.extract()
+ else:
+ for element in self.contents[:]:
+ element.extract()
+
+ def index(self, element):
+ """
+ Find the index of a child by identity, not value. Avoids issues with
+ tag.contents.index(element) getting the index of equal elements.
+ """
+ for i, child in enumerate(self.contents):
+ if child is element:
+ return i
+ raise ValueError("Tag.index: element not in tag")
+
def get(self, key, default=None):
"""Returns the value of the 'key' attribute for the tag, or
the value given for 'default' if it doesn't have that
@@ -510,6 +583,8 @@ class Tag(PageElement):
def __eq__(self, other):
"""Returns true iff this tag has the same name, the same attributes,
and the same contents (recursively) as the given tag."""
+ if self is other:
+ return True
if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
return False
for i in range(0, len(self.contents)):
@@ -606,16 +681,6 @@ class Tag(PageElement):
s = ''.join(s)
return s
- def decompose(self):
- """Recursively destroys the contents of this tree."""
- contents = [i for i in self.contents]
- for i in contents:
- if isinstance(i, Tag):
- i.decompose()
- else:
- i.extract()
- self.extract()
-
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
return self.encode(encoding, True)
@@ -684,12 +749,13 @@ class Tag(PageElement):
#Generator methods
@property
def children(self):
+ # return iter() to make the purpose of the method clear
return iter(self.contents) # XXX This seems to be untested.
@property
def recursive_children(self):
if not len(self.contents):
- raise StopIteration # XXX return instead?
+ return
stopNode = self._last_recursive_child().next
current = self.contents[0]
while current is not stopNode:
@@ -712,7 +778,7 @@ class SoupStrainer(object):
def __init__(self, name=None, attrs={}, text=None, **kwargs):
self.name = name
if isinstance(attrs, basestring):
- kwargs['class'] = attrs
+ kwargs['class'] = _match_css_class(attrs)
attrs = None
if kwargs:
if attrs:
@@ -795,8 +861,8 @@ class SoupStrainer(object):
def _matches(self, markup, matchAgainst):
#print "Matching %s against %s" % (markup, matchAgainst)
result = False
- if matchAgainst == True and type(matchAgainst) == types.BooleanType:
- result = markup != None
+ if matchAgainst is True:
+ result = markup is not None
elif callable(matchAgainst):
result = matchAgainst(markup)
else: