summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- CHANGELOG 9
-rw-r--r-- TODO 11
-rw-r--r-- bs4/__init__.py 11
-rw-r--r-- bs4/dammit.py 91
-rw-r--r-- bs4/element.py 435
-rw-r--r-- bs4/testing.py 6
-rw-r--r-- bs4/util.py 23
-rw-r--r-- tests/test_tree.py 153
8 files changed, 454 insertions, 285 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 00d80da..1e8b449 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -19,10 +19,11 @@ version of the API is in use, the module is now called 'bs4':
== Better method names ==
-Methods have been renamed to comply with PEP 8. The old names still
-work. Here are the renames:
+Methods and attributes have been renamed to comply with PEP 8. The old names
+still work. Here are the renames:
* replaceWith -> replace_with
+ * replaceWithChildren -> replace_with_children
* findAll -> find_all
* findAllNext -> find_all_next
* findAllPrevious -> find_all_previous
@@ -34,6 +35,8 @@ work. Here are the renames:
* findPrevious -> find_previous
* findPreviousSibling -> find_previous_sibling
* findPreviousSiblings -> find_previous_siblings
+ * nextSibling -> next_sibling
+ * previousSibling -> previous_sibling
Some attributes have also been renamed:
@@ -159,7 +162,7 @@ A later version of Beautiful Soup will allow you to plug in different
parsers to make tradeoffs between speed and the ability to handle bad
HTML.
-3. In Python 3 (but not Python 2),HTMLParser converts entities within
+3. In Python 3 (but not Python 2), HTMLParser converts entities within
attributes to the corresponding Unicode characters. In Python 2 it's
possible to parse this string and leave the é intact.
diff --git a/TODO b/TODO
index 2606566..060dc13 100644
--- a/TODO
+++ b/TODO
@@ -1,10 +1,7 @@
-3.0.8 optimized findAll('tag-name') and findAll(True), bypassing the
-soupstrainer process.
-
-define a setter for Tag.string which replaces the tag's contents with
-a string.
-
-Tag.text: concatenate all strings and return them
+if len(tag) > 3 and tag.endswith('Tag'): -> endswith('_tag')
+markup_attr_map can be optimized since it's always a map now.
+Can we get rid of isList?
+Split self.assertRaises(ValueError, tree.index, 1) into a separate test
Bare ampersands should be converted to HTML entities upon output.
diff --git a/bs4/__init__.py b/bs4/__init__.py
index e0eba75..66a1c02 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -65,7 +65,7 @@ class BeautifulSoup(Tag):
# can be replaced with a single space. A text node that contains
# fancy Unicode spaces (usually non-breaking) should be left
# alone.
- STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
+ STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None):
@@ -167,7 +167,6 @@ class BeautifulSoup(Tag):
self.previous = o
self.currentTag.contents.append(o)
-
def _popToTag(self, name, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
instance of the given tag. If inclusivePop is false, pops the tag
@@ -179,9 +178,10 @@ class BeautifulSoup(Tag):
numPops = 0
mostRecentTag = None
- for i in range(len(self.tagStack)-1, 0, -1):
+
+ for i in range(len(self.tagStack) - 1, 0, -1):
if name == self.tagStack[i].name:
- numPops = len(self.tagStack)-i
+ numPops = len(self.tagStack) - i
break
if not inclusivePop:
numPops = numPops - 1
@@ -204,7 +204,7 @@ class BeautifulSoup(Tag):
if (self.parse_only and len(self.tagStack) <= 1
and (self.parse_only.text
- or not self.parse_only.searchTag(name, attrs))):
+ or not self.parse_only.search_tag(name, attrs))):
return None
tag = Tag(self, self.builder, name, attrs, self.currentTag,
@@ -217,7 +217,6 @@ class BeautifulSoup(Tag):
self.pushTag(tag)
return tag
-
def handle_endtag(self, name):
#print "End tag: " + name
self.endData()
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 75d445e..f3e770e 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -42,7 +42,7 @@ class EntitySubstitution(object):
# There's no point in turning the quotation mark into
# &quot;, unless it happens within an attribute value, which
# is handled elsewhere.
- continue;
+ continue
character = unichr(codepoint)
characters.append(character)
lookup[character] = name
@@ -52,13 +52,12 @@ class EntitySubstitution(object):
(CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
-
CHARACTER_TO_XML_ENTITY = {
- "'" : "apos",
- '"' : "quot",
- "&" : "amp",
- "<" : "lt",
- ">" : "gt",
+ "'": "apos",
+ '"': "quot",
+ "&": "amp",
+ "<": "lt",
+ ">": "gt",
}
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
@@ -157,8 +156,8 @@ class UnicodeDammit:
# meta tags to the corresponding Python codec names. It only covers
# values that aren't in Python's aliases and can't be determined
# by the heuristics in find_codec.
- CHARSET_ALIASES = { "macintosh" : "mac-roman",
- "x-sjis" : "shift-jis" }
+ CHARSET_ALIASES = {"macintosh": "mac-roman",
+ "x-sjis": "shift-jis"}
ENCODINGS_WITH_SMART_QUOTES = [
"windows-1252",
@@ -198,7 +197,8 @@ class UnicodeDammit:
break
self.unicode = u
- if not u: self.original_encoding = None
+ if not u:
+ self.original_encoding = None
def _sub_ms_char(self, match):
"""Changes a MS smart quote character to an XML or HTML
@@ -335,7 +335,6 @@ class UnicodeDammit:
xml_encoding = sniffed_xml_encoding
return xml_data, xml_encoding, sniffed_xml_encoding
-
def find_codec(self, charset):
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
or (charset and self._codec(charset.replace("-", ""))) \
@@ -343,7 +342,8 @@ class UnicodeDammit:
or charset
def _codec(self, charset):
- if not charset: return charset
+ if not charset:
+ return charset
codec = None
try:
codecs.lookup(charset)
@@ -353,6 +353,7 @@ class UnicodeDammit:
return codec
EBCDIC_TO_ASCII_MAP = None
+
def _ebcdic_to_ascii(self, s):
c = self.__class__
if not c.EBCDIC_TO_ASCII_MAP:
@@ -374,39 +375,39 @@ class UnicodeDammit:
90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
250,251,252,253,254,255)
import string
- c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
+ c.EBCDIC_TO_ASCII_MAP = string.maketrans(
''.join(map(chr, range(256))), ''.join(map(chr, emap)))
return s.translate(c.EBCDIC_TO_ASCII_MAP)
- MS_CHARS = { '\x80' : ('euro', '20AC'),
- '\x81' : ' ',
- '\x82' : ('sbquo', '201A'),
- '\x83' : ('fnof', '192'),
- '\x84' : ('bdquo', '201E'),
- '\x85' : ('hellip', '2026'),
- '\x86' : ('dagger', '2020'),
- '\x87' : ('Dagger', '2021'),
- '\x88' : ('circ', '2C6'),
- '\x89' : ('permil', '2030'),
- '\x8A' : ('Scaron', '160'),
- '\x8B' : ('lsaquo', '2039'),
- '\x8C' : ('OElig', '152'),
- '\x8D' : '?',
- '\x8E' : ('#x17D', '17D'),
- '\x8F' : '?',
- '\x90' : '?',
- '\x91' : ('lsquo', '2018'),
- '\x92' : ('rsquo', '2019'),
- '\x93' : ('ldquo', '201C'),
- '\x94' : ('rdquo', '201D'),
- '\x95' : ('bull', '2022'),
- '\x96' : ('ndash', '2013'),
- '\x97' : ('mdash', '2014'),
- '\x98' : ('tilde', '2DC'),
- '\x99' : ('trade', '2122'),
- '\x9a' : ('scaron', '161'),
- '\x9b' : ('rsaquo', '203A'),
- '\x9c' : ('oelig', '153'),
- '\x9d' : '?',
- '\x9e' : ('#x17E', '17E'),
- '\x9f' : ('Yuml', ''),}
+ MS_CHARS = {'\x80': ('euro', '20AC'),
+ '\x81': ' ',
+ '\x82': ('sbquo', '201A'),
+ '\x83': ('fnof', '192'),
+ '\x84': ('bdquo', '201E'),
+ '\x85': ('hellip', '2026'),
+ '\x86': ('dagger', '2020'),
+ '\x87': ('Dagger', '2021'),
+ '\x88': ('circ', '2C6'),
+ '\x89': ('permil', '2030'),
+ '\x8A': ('Scaron', '160'),
+ '\x8B': ('lsaquo', '2039'),
+ '\x8C': ('OElig', '152'),
+ '\x8D': '?',
+ '\x8E': ('#x17D', '17D'),
+ '\x8F': '?',
+ '\x90': '?',
+ '\x91': ('lsquo', '2018'),
+ '\x92': ('rsquo', '2019'),
+ '\x93': ('ldquo', '201C'),
+ '\x94': ('rdquo', '201D'),
+ '\x95': ('bull', '2022'),
+ '\x96': ('ndash', '2013'),
+ '\x97': ('mdash', '2014'),
+ '\x98': ('tilde', '2DC'),
+ '\x99': ('trade', '2122'),
+ '\x9a': ('scaron', '161'),
+ '\x9b': ('rsaquo', '203A'),
+ '\x9c': ('oelig', '153'),
+ '\x9d': '?',
+ '\x9e': ('#x17E', '17E'),
+ '\x9f': ('Yuml', ''),}
diff --git a/bs4/element.py b/bs4/element.py
index 6fb6210..e141aa8 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -11,6 +11,23 @@ from util import isList
DEFAULT_OUTPUT_ENCODING = "utf-8"
+def _match_css_class(str):
+    """Build a RE to match the given CSS class.
+
+    The class name is escaped, so any regex metacharacters in it are
+    matched literally. The resulting pattern matches when the name
+    appears as a whole whitespace-delimited token, e.g. the pattern
+    for "3" matches "3 4" but not "34".
+    """
+    # NOTE: the parameter shadows the builtin ``str``; the name is kept
+    # so the signature stays unchanged.
+    return re.compile(r"(^|.*\s)%s($|\s)" % re.escape(str))
+
+
+def _alias(attr):
+    """Alias one attribute name to another for backward compatibility.
+
+    Returns a property that forwards both reads and writes to the
+    attribute named ``attr`` on the same instance.
+    """
+    @property
+    def alias(self):
+        return getattr(self, attr)
+
+    @alias.setter
+    def alias(self, value):
+        # A property setter is called with the assigned value, and
+        # setattr() needs that value as its third argument.
+        setattr(self, attr, value)
+    return alias
+
+
class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
@@ -21,122 +38,132 @@ class PageElement(object):
self.parent = parent
self.previous = previous
self.next = None
- self.previousSibling = None
- self.nextSibling = None
+ self.previous_sibling = None
+ self.next_sibling = None
if self.parent and self.parent.contents:
- self.previousSibling = self.parent.contents[-1]
- self.previousSibling.nextSibling = self
+ self.previous_sibling = self.parent.contents[-1]
+ self.previous_sibling.next_sibling = self
+
+ nextSibling = _alias("next_sibling") # BS3
+ previousSibling = _alias("previous_sibling") # BS3
def replace_with(self, replace_with):
- oldParent = self.parent
- myIndex = self.parent.contents.index(self)
- if hasattr(replace_with, 'parent') and replace_with.parent == self.parent:
+ if replace_with is self:
+ return
+ old_parent = self.parent
+ my_index = self.parent.index(self)
+ if (hasattr(replace_with, 'parent')
+ and replace_with.parent is self.parent):
# We're replacing this element with one of its siblings.
- index = self.parent.contents.index(replace_with)
- if index and index < myIndex:
+ if self.parent.index(replace_with) < my_index:
# Furthermore, it comes before this element. That
# means that when we extract it, the index of this
# element will change.
- myIndex = myIndex - 1
+ my_index -= 1
self.extract()
- oldParent.insert(myIndex, replace_with)
- replaceWith = replace_with # BS4
+ old_parent.insert(my_index, replace_with)
+ replaceWith = replace_with # BS3
+
+    def replace_with_children(self):
+        """Replace this element with its own children.
+
+        The element is extracted from its parent, and each of its
+        children is then inserted into that parent at the position the
+        element used to occupy.
+        """
+        parent = self.parent
+        position = parent.index(self)
+        self.extract()
+        # Walk a reversed copy and always insert at the same position,
+        # so the children end up in their original order. A copy is
+        # used because insert() extracts the child from its old parent.
+        for child in self.contents[::-1]:
+            parent.insert(position, child)
+    replaceWithChildren = replace_with_children  # BS3
def extract(self):
"""Destructively rips this element out of the tree."""
if self.parent:
- try:
- self.parent.contents.remove(self)
- except ValueError:
- pass
+ del self.parent.contents[self.parent.index(self)]
#Find the two elements that would be next to each other if
#this element (and any children) hadn't been parsed. Connect
#the two.
- lastChild = self._last_recursive_child()
- nextElement = lastChild.next
+ last_child = self._last_recursive_child()
+ next_element = last_child.next
if self.previous:
- self.previous.next = nextElement
- if nextElement:
- nextElement.previous = self.previous
+ self.previous.next = next_element
+ if next_element:
+ next_element.previous = self.previous
self.previous = None
- lastChild.next = None
+ last_child.next = None
self.parent = None
- if self.previousSibling:
- self.previousSibling.nextSibling = self.nextSibling
- if self.nextSibling:
- self.nextSibling.previousSibling = self.previousSibling
- self.previousSibling = self.nextSibling = None
+ if self.previous_sibling:
+ self.previous_sibling.next_sibling = self.next_sibling
+ if self.next_sibling:
+ self.next_sibling.previous_sibling = self.previous_sibling
+ self.previous_sibling = self.next_sibling = None
return self
def _last_recursive_child(self):
"Finds the last element beneath this object to be parsed."
- lastChild = self
- while hasattr(lastChild, 'contents') and lastChild.contents:
- lastChild = lastChild.contents[-1]
- return lastChild
-
- def insert(self, position, newChild):
- if (isinstance(newChild, basestring)
- or isinstance(newChild, unicode)) \
- and not isinstance(newChild, NavigableString):
- newChild = NavigableString(newChild)
-
- position = min(position, len(self.contents))
- if hasattr(newChild, 'parent') and newChild.parent != None:
+ last_child = self
+ while hasattr(last_child, 'contents') and last_child.contents:
+ last_child = last_child.contents[-1]
+ return last_child
+ # BS3: Not part of the API!
+ _lastRecursiveChild = _last_recursive_child
+
+ def insert(self, position, new_child):
+ if (isinstance(new_child, basestring)
+ and not isinstance(new_child, NavigableString)):
+ new_child = NavigableString(new_child)
+
+ position = min(position, len(self.contents))
+ if hasattr(new_child, 'parent') and new_child.parent is not None:
# We're 'inserting' an element that's already one
# of this object's children.
- if newChild.parent == self:
- index = self.find(newChild)
- if index and index < position:
+ if new_child.parent is self:
+ if self.index(new_child) > position:
# Furthermore we're moving it further down the
# list of this object's children. That means that
# when we extract this element, our target index
# will jump down one.
- position = position - 1
- newChild.extract()
+ position -= 1
+ new_child.extract()
- newChild.parent = self
- previousChild = None
+ new_child.parent = self
+ previous_child = None
if position == 0:
- newChild.previousSibling = None
- newChild.previous = self
+ new_child.previous_sibling = None
+ new_child.previous = self
else:
- previousChild = self.contents[position-1]
- newChild.previousSibling = previousChild
- newChild.previousSibling.nextSibling = newChild
- newChild.previous = previousChild._last_recursive_child()
- if newChild.previous:
- newChild.previous.next = newChild
+ previous_child = self.contents[position - 1]
+ new_child.previous_sibling = previous_child
+ new_child.previous_sibling.next_sibling = new_child
+ new_child.previous = previous_child._last_recursive_child()
+ if new_child.previous:
+ new_child.previous.next = new_child
- newChildsLastElement = newChild._last_recursive_child()
+ new_childs_last_element = new_child._last_recursive_child()
if position >= len(self.contents):
- newChild.nextSibling = None
+ new_child.next_sibling = None
parent = self
- parentsNextSibling = None
- while not parentsNextSibling:
- parentsNextSibling = parent.nextSibling
+ parents_next_sibling = None
+ while not parents_next_sibling:
+ parents_next_sibling = parent.next_sibling
parent = parent.parent
- if not parent: # This is the last element in the document.
+ if not parent: # This is the last element in the document.
break
- if parentsNextSibling:
- newChildsLastElement.next = parentsNextSibling
+ if parents_next_sibling:
+ new_childs_last_element.next = parents_next_sibling
else:
- newChildsLastElement.next = None
+ new_childs_last_element.next = None
else:
- nextChild = self.contents[position]
- newChild.nextSibling = nextChild
- if newChild.nextSibling:
- newChild.nextSibling.previousSibling = newChild
- newChildsLastElement.next = nextChild
+ next_child = self.contents[position]
+ new_child.next_sibling = next_child
+ if new_child.next_sibling:
+ new_child.next_sibling.previous_sibling = new_child
+ new_childs_last_element.next = next_child
- if newChildsLastElement.next:
- newChildsLastElement.next.previous = newChildsLastElement
- self.contents.insert(position, newChild)
+ if new_childs_last_element.next:
+ new_childs_last_element.next.previous = new_childs_last_element
+ self.contents.insert(position, new_child)
def append(self, tag):
"""Appends the given tag to the contents of this tag."""
@@ -146,7 +173,7 @@ class PageElement(object):
"""Returns the first item that matches the given criteria and
appears after this Tag in the document."""
return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
- findNext = find_next # BS3
+ findNext = find_next # BS3
def find_all_next(self, name=None, attrs={}, text=None, limit=None,
**kwargs):
@@ -154,14 +181,14 @@ class PageElement(object):
after this Tag in the document."""
return self._find_all(name, attrs, text, limit, self.next_elements,
**kwargs)
- findAllNext = find_all_next # BS3
+ findAllNext = find_all_next # BS3
def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the closest sibling to this Tag that matches the
given criteria and appears after this Tag in the document."""
return self._find_one(self.find_next_siblings, name, attrs, text,
**kwargs)
- findNextSibling = find_next_sibling # BS3
+ findNextSibling = find_next_sibling # BS3
def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
**kwargs):
@@ -169,15 +196,15 @@ class PageElement(object):
criteria and appear after this Tag in the document."""
return self._find_all(name, attrs, text, limit,
self.next_siblings, **kwargs)
- findNextSiblings = find_next_siblings # BS3
- fetchNextSiblings = find_next_siblings # BS2
+ findNextSiblings = find_next_siblings # BS3
+ fetchNextSiblings = find_next_siblings # BS2
def find_previous(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the first item that matches the given criteria and
appears before this Tag in the document."""
return self._find_one(
self.find_all_previous, name, attrs, text, **kwargs)
- findPrevious = find_previous # BS3
+ findPrevious = find_previous # BS3
def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
**kwargs):
@@ -185,15 +212,15 @@ class PageElement(object):
before this Tag in the document."""
return self._find_all(name, attrs, text, limit, self.previous_elements,
**kwargs)
- findAllPrevious = find_all_previous # BS3
- fetchPrevious = find_all_previous # BS2
+ findAllPrevious = find_all_previous # BS3
+ fetchPrevious = find_all_previous # BS2
def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the closest sibling to this Tag that matches the
given criteria and appears before this Tag in the document."""
return self._find_one(self.find_previous_siblings, name, attrs, text,
**kwargs)
- findPreviousSibling = find_previous_sibling # BS3
+ findPreviousSibling = find_previous_sibling # BS3
def find_previous_siblings(self, name=None, attrs={}, text=None,
limit=None, **kwargs):
@@ -201,8 +228,8 @@ class PageElement(object):
criteria and appear before this Tag in the document."""
return self._find_all(name, attrs, text, limit,
self.previous_siblings, **kwargs)
- findPreviousSiblings = find_previous_siblings # BS3
- fetchPreviousSiblings = find_previous_siblings # BS2
+ findPreviousSiblings = find_previous_siblings # BS3
+ fetchPreviousSiblings = find_previous_siblings # BS2
def find_parent(self, name=None, attrs={}, **kwargs):
"""Returns the closest parent of this Tag that matches the given
@@ -214,7 +241,7 @@ class PageElement(object):
if l:
r = l[0]
return r
- findParent = find_parent # BS3
+ findParent = find_parent # BS3
def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
"""Returns the parents of this Tag that match the given
@@ -222,8 +249,8 @@ class PageElement(object):
return self._find_all(name, attrs, None, limit, self.parents,
**kwargs)
- findParents = find_parents # BS3
- fetchParents = find_parents # BS2
+ findParents = find_parents # BS3
+ fetchParents = find_parents # BS2
#These methods do the real heavy lifting.
@@ -239,6 +266,17 @@ class PageElement(object):
if isinstance(name, SoupStrainer):
strainer = name
+ elif text is None and not limit and not attrs and not kwargs:
+ # findAll*(True)
+ if name is True or name is None:
+ return [element for element in generator
+ if isinstance(element, Tag)]
+ # findAll*('tag-name')
+ elif isinstance(name, basestring):
+ return [element for element in generator
+ if isinstance(element, Tag) and element.name == name]
+ else:
+ strainer = SoupStrainer(name, attrs, text, **kwargs)
else:
# Build a SoupStrainer
strainer = SoupStrainer(name, attrs, text, **kwargs)
@@ -261,35 +299,35 @@ class PageElement(object):
@property
def next_elements(self):
i = self
- while i:
+ while i is not None:
i = i.next
yield i
@property
def next_siblings(self):
i = self
- while i:
- i = i.nextSibling
+ while i is not None:
+ i = i.next_sibling
yield i
@property
def previous_elements(self):
i = self
- while i:
+ while i is not None:
i = i.previous
yield i
@property
def previous_siblings(self):
i = self
- while i:
- i = i.previousSibling
+ while i is not None:
+ i = i.previous_sibling
yield i
@property
def parents(self):
i = self
- while i:
+ while i is not None:
i = i.parent
yield i
@@ -343,7 +381,8 @@ class NavigableString(unicode, PageElement):
if attr == 'string':
return self
else:
- raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
+ raise AttributeError("'%s' object has no attribute '%s'" %
+ (self.__class__.__name__, attr))
def output_ready(self, substitute_html_entities=False):
if substitute_html_entities:
@@ -402,9 +441,9 @@ class Tag(PageElement):
# We don't actually store the parser object: that lets extracted
# chunks be garbage-collected.
- self.parserClass = parser.__class__
+ self.parser_class = parser.__class__
self.name = name
- if attrs == None:
+ if attrs is None:
attrs = {}
else:
attrs = dict(attrs)
@@ -418,6 +457,8 @@ class Tag(PageElement):
self.can_be_empty_element = builder.can_be_empty_element(name)
+ parserClass = _alias("parser_class") # BS3
+
@property
def is_empty_element(self):
"""Is this tag an empty-element tag? (aka a self-closing tag)
@@ -434,8 +475,7 @@ class Tag(PageElement):
then any tag with no contents is an empty-element tag.
"""
return len(self.contents) == 0 and self.can_be_empty_element
- isSelfClosing = is_empty_element # BS3
-
+ isSelfClosing = is_empty_element # BS3
@property
def string(self):
@@ -454,6 +494,60 @@ class Tag(PageElement):
return child
return child.string
+ @string.setter
+ def string(self, string):
+ """Replace this tag's contents with the given string.
+
+ All existing children are removed with clear(), then ``string`` is
+ appended to the now-empty tag.
+ """
+ self.clear()
+ self.append(string)
+
+    def get_text(self, separator=u"", strip=False):
+        """
+        Join every string found beneath this tag using ``separator``.
+
+        When ``strip`` is true, each string is stripped of surrounding
+        whitespace, and strings that are empty after stripping are
+        left out of the result.
+        """
+        strings = (node for node in self.recursive_children
+                   if isinstance(node, NavigableString))
+        if strip:
+            stripped = (text.strip() for text in strings)
+            strings = (text for text in stripped if text)
+        return separator.join(strings)
+    getText = get_text
+
+    text = property(get_text)
+
+    def decompose(self):
+        """Detach this element from the tree, then wipe it out.
+
+        After extract(), the ``next`` chain starting at this element is
+        walked and the attribute dictionary of every element on it is
+        emptied.
+        """
+        self.extract()
+        node = self
+        while node is not None:
+            # Grab the successor first: clearing __dict__ erases the
+            # ``next`` attribute as well.
+            successor = node.next
+            node.__dict__.clear()
+            node = successor
+
+    def clear(self, decompose=False):
+        """
+        Remove every child of this tag.
+
+        Each child is extracted. If ``decompose`` is true, children
+        that are Tags are decomposed instead; other children are still
+        just extracted.
+        """
+        # Iterate over a copy: the loop body calls extract()/decompose()
+        # on children of self.contents.
+        for element in self.contents[:]:
+            if decompose and isinstance(element, Tag):
+                element.decompose()
+            else:
+                element.extract()
+
+    def index(self, element):
+        """
+        Return the position of ``element`` among this tag's children.
+
+        Children are compared by identity (``is``), not by value, so
+        an equal but distinct child is never mistaken for ``element``
+        the way tag.contents.index(element) could be. Raises
+        ValueError if ``element`` is not one of the children.
+        """
+        matches = (position
+                   for position, child in enumerate(self.contents)
+                   if child is element)
+        # The first identity match, if any, is the answer.
+        for position in matches:
+            return position
+        raise ValueError("Tag.index: element not in tag")
+
def get(self, key, default=None):
"""Returns the value of the 'key' attribute for the tag, or
the value given for 'default' if it doesn't have that
@@ -461,7 +555,7 @@ class Tag(PageElement):
return self.attrs.get(key, default)
def has_key(self, key):
- return self.attrs.has_key(key)
+ return key in self.attrs
def __getitem__(self, key):
"""tag[key] returns the value of the 'key' attribute for the tag,
@@ -490,8 +584,7 @@ class Tag(PageElement):
def __delitem__(self, key):
"Deleting tag[key] deletes all 'key' attributes for the tag."
- if self.attrs.has_key(key):
- del self.attrs[key]
+ self.attrs.pop(key, None)
def __call__(self, *args, **kwargs):
"""Calling a tag like a function is the same as calling its
@@ -501,19 +594,27 @@ class Tag(PageElement):
def __getattr__(self, tag):
#print "Getattr %s.%s" % (self.__class__, tag)
- if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
+ if len(tag) > 3 and tag.endswith('Tag'):
return self.find(tag[:-3])
- elif tag.find('__') != 0:
+ elif not tag.startswith("__"):
return self.find(tag)
- raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
+ raise AttributeError("'%s' object has no attribute '%s'" %
+ (self.__class__, tag))
def __eq__(self, other):
"""Returns true iff this tag has the same name, the same attributes,
and the same contents (recursively) as the given tag."""
- if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
+ if self is other:
+ return True
+ if (not hasattr(other, 'name') or
+ not hasattr(other, 'attrs') or
+ not hasattr(other, 'contents') or
+ self.name != other.name or
+ self.attrs != other.attrs or
+ len(self) != len(other)):
return False
- for i in range(0, len(self.contents)):
- if self.contents[i] != other.contents[i]:
+ for i, my_child in enumerate(self.contents):
+ if my_child != other.contents[i]:
return False
return True
@@ -574,7 +675,7 @@ class Tag(PageElement):
pretty_print = (indent_level is not None)
if pretty_print:
- space = (' ' * (indent_level-1))
+ space = (' ' * (indent_level - 1))
indent_contents = indent_level + 1
else:
space = ''
@@ -587,12 +688,12 @@ class Tag(PageElement):
s = contents
else:
s = []
- attributeString = ''
+ attribute_string = ''
if attrs:
- attributeString = ' ' + ' '.join(attrs)
+ attribute_string = ' ' + ' '.join(attrs)
if pretty_print:
s.append(space)
- s.append('<%s%s%s>' % (self.name, attributeString, close))
+ s.append('<%s%s%s>' % (self.name, attribute_string, close))
if pretty_print:
s.append("\n")
s.append(contents)
@@ -601,21 +702,11 @@ class Tag(PageElement):
if pretty_print and closeTag:
s.append(space)
s.append(closeTag)
- if pretty_print and closeTag and self.nextSibling:
+ if pretty_print and closeTag and self.next_sibling:
s.append("\n")
s = ''.join(s)
return s
- def decompose(self):
- """Recursively destroys the contents of this tree."""
- contents = [i for i in self.contents]
- for i in contents:
- if isinstance(i, Tag):
- i.decompose()
- else:
- i.extract()
- self.extract()
-
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
return self.encode(encoding, True)
@@ -632,7 +723,7 @@ class Tag(PageElement):
encoding.
"""
pretty_print = (indent_level is not None)
- s=[]
+ s = []
for c in self:
text = None
if isinstance(c, NavigableString):
@@ -644,7 +735,7 @@ class Tag(PageElement):
text = text.strip()
if text:
if pretty_print:
- s.append(" " * (indent_level-1))
+ s.append(" " * (indent_level - 1))
s.append(text)
if pretty_print:
s.append("\n")
@@ -678,18 +769,19 @@ class Tag(PageElement):
if not recursive:
generator = self.children
return self._find_all(name, attrs, text, limit, generator, **kwargs)
- findAll = find_all # BS3
- findChildren = find_all # BS2
+ findAll = find_all # BS3
+ findChildren = find_all # BS2
#Generator methods
@property
def children(self):
- return iter(self.contents) # XXX This seems to be untested.
+ # return iter() to make the purpose of the method clear
+ return iter(self.contents) # XXX This seems to be untested.
@property
def recursive_children(self):
if not len(self.contents):
- raise StopIteration # XXX return instead?
+ return
stopNode = self._last_recursive_child().next
current = self.contents[0]
while current is not stopNode:
@@ -712,7 +804,7 @@ class SoupStrainer(object):
def __init__(self, name=None, attrs={}, text=None, **kwargs):
self.name = name
if isinstance(attrs, basestring):
- kwargs['class'] = attrs
+ kwargs['class'] = _match_css_class(attrs)
attrs = None
if kwargs:
if attrs:
@@ -729,42 +821,43 @@ class SoupStrainer(object):
else:
return "%s|%s" % (self.name, self.attrs)
- def searchTag(self, markupName=None, markupAttrs={}):
+ def search_tag(self, markup_name=None, markup_attrs={}):
found = None
markup = None
- if isinstance(markupName, Tag):
- markup = markupName
- markupAttrs = markup
- callFunctionWithTagData = callable(self.name) \
- and not isinstance(markupName, Tag)
+ if isinstance(markup_name, Tag):
+ markup = markup_name
+ markup_attrs = markup
+ call_function_with_tag_data = callable(self.name) \
+ and not isinstance(markup_name, Tag)
if (not self.name) \
- or callFunctionWithTagData \
+ or call_function_with_tag_data \
or (markup and self._matches(markup, self.name)) \
- or (not markup and self._matches(markupName, self.name)):
- if callFunctionWithTagData:
- match = self.name(markupName, markupAttrs)
+ or (not markup and self._matches(markup_name, self.name)):
+ if call_function_with_tag_data:
+ match = self.name(markup_name, markup_attrs)
else:
match = True
- markupAttrMap = None
- for attr, matchAgainst in self.attrs.items():
- if not markupAttrMap:
- if hasattr(markupAttrs, 'get'):
- markupAttrMap = markupAttrs
- else:
- markupAttrMap = {}
- for k,v in markupAttrs:
- markupAttrMap[k] = v
- attrValue = markupAttrMap.get(attr)
- if not self._matches(attrValue, matchAgainst):
+ markup_attr_map = None
+ for attr, match_against in self.attrs.items():
+ if not markup_attr_map:
+ if hasattr(markup_attrs, 'get'):
+ markup_attr_map = markup_attrs
+ else:
+ markup_attr_map = {}
+ for k, v in markup_attrs:
+ markup_attr_map[k] = v
+ attr_value = markup_attr_map.get(attr)
+ if not self._matches(attr_value, match_against):
match = False
break
if match:
if markup:
found = markup
else:
- found = markupName
+ found = markup_name
return found
+ searchTag = search_tag
def search(self, markup):
#print 'looking for %s in %s' % (self, markup)
@@ -781,24 +874,24 @@ class SoupStrainer(object):
# Don't bother with Tags if we're searching for text.
elif isinstance(markup, Tag):
if not self.text:
- found = self.searchTag(markup)
+ found = self.search_tag(markup)
# If it's text, make sure the text matches.
elif isinstance(markup, NavigableString) or \
isinstance(markup, basestring):
if self._matches(markup, self.text):
found = markup
else:
- raise Exception, "I don't know how to match against a %s" \
- % markup.__class__
+ raise Exception("I don't know how to match against a %s"
+ % markup.__class__)
return found
- def _matches(self, markup, matchAgainst):
- #print "Matching %s against %s" % (markup, matchAgainst)
+ def _matches(self, markup, match_against):
+ #print "Matching %s against %s" % (markup, match_against)
result = False
- if matchAgainst == True and type(matchAgainst) == types.BooleanType:
- result = markup != None
- elif callable(matchAgainst):
- result = matchAgainst(markup)
+ if match_against is True:
+ result = markup is not None
+ elif callable(match_against):
+ result = match_against(markup)
else:
#Custom match methods take the tag as an argument, but all
#other ways of matching match the tag name as a string.
@@ -807,23 +900,23 @@ class SoupStrainer(object):
if markup is not None and not isinstance(markup, basestring):
markup = unicode(markup)
#Now we know that chunk is either a string, or None.
- if hasattr(matchAgainst, 'match'):
+ if hasattr(match_against, 'match'):
# It's a regexp object.
- result = markup and matchAgainst.search(markup)
- elif (isList(matchAgainst)
+ result = markup and match_against.search(markup)
+ elif (isList(match_against)
and (markup is not None
- or not isinstance(matchAgainst, basestring))):
- result = markup in matchAgainst
- elif hasattr(matchAgainst, 'items'):
- result = markup.has_key(matchAgainst)
- elif matchAgainst and isinstance(markup, basestring):
+ or not isinstance(match_against, basestring))):
+ result = markup in match_against
+ elif hasattr(match_against, 'items'):
+ result = match_against in markup
+ elif match_against and isinstance(markup, basestring):
if isinstance(markup, unicode):
- matchAgainst = unicode(matchAgainst)
+ match_against = unicode(match_against)
else:
- matchAgainst = str(matchAgainst)
+ match_against = str(match_against)
if not result:
- result = matchAgainst == markup
+ result = match_against == markup
return result
diff --git a/bs4/testing.py b/bs4/testing.py
index 9d9c26a..91c623a 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
from bs4.element import Comment, SoupStrainer
from bs4.builder import LXMLTreeBuilder
+
class SoupTest(unittest.TestCase):
@property
@@ -30,8 +31,3 @@ class SoupTest(unittest.TestCase):
compare_parsed_to = to_parse
self.assertEquals(obj.decode(), self.document_for(compare_parsed_to))
-
-
-
-
-
diff --git a/bs4/util.py b/bs4/util.py
new file mode 100644
index 0000000..8e33273
--- /dev/null
+++ b/bs4/util.py
@@ -0,0 +1,23 @@
+# Helper functions and mixin classes for Beautiful Soup
+
+import types
+try:
+ set
+except NameError:
+ from sets import Set as set
+
+
+def isList(l):
+ """Convenience method that works with all 2.x versions of Python
+ to determine whether or not something is listlike."""
+ return ((hasattr(l, '__iter__') and not isinstance(l, basestring))
+ or (type(l) in (types.ListType, types.TupleType)))
+
+
+def buildSet(args=None):
+ """Turns a list or a string into a set."""
+ if isinstance(args, str):
+ return set([args])
+ if args is None:
+ return set()
+ return set(args)
diff --git a/tests/test_tree.py b/tests/test_tree.py
index f2989fe..1718c6a 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -179,10 +179,13 @@ class TestFindAllByAttribute(TreeTest):
tree = self.soup("""
<a class="1">Class 1.</a>
<a class="2">Class 2.</a>
- <b class="1">Class 1.</a>
+ <b class="1">Class 1.</b>
+ <c class="3 4">Class 3 and 4.</c>
""")
self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
+ self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
+ self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])
def test_find_all_by_attribute_soupstrainer(self):
tree = self.soup("""
@@ -242,6 +245,24 @@ class TestFindAllByAttribute(TreeTest):
["One a.", "Two as."])
+class TestIndex(TreeTest):
+ """Test Tag.index"""
+ def test_index(self):
+ tree = self.soup("""<wrap>
+ <a>Identical</a>
+ <b>Not identical</b>
+ <a>Identical</a>
+
+ <c><d>Identical with child</d></c>
+ <b>Also not identical</b>
+ <c><d>Identical with child</d></c>
+ </wrap>""")
+ wrap = tree.wrap
+ for i, element in enumerate(wrap.contents):
+ self.assertEqual(i, wrap.index(element))
+ self.assertRaises(ValueError, tree.index, 1)
+
+
class TestParentOperations(TreeTest):
"""Test navigation and searching through an element's parents."""
@@ -429,23 +450,23 @@ class TestNextSibling(SiblingTest):
self.start = self.tree.find(id="1")
def test_next_sibling_of_root_is_none(self):
- self.assertEquals(self.tree.nextSibling, None)
+ self.assertEquals(self.tree.next_sibling, None)
def test_next_sibling(self):
- self.assertEquals(self.start.nextSibling['id'], '2')
- self.assertEquals(self.start.nextSibling.nextSibling['id'], '3')
+ self.assertEquals(self.start.next_sibling['id'], '2')
+ self.assertEquals(self.start.next_sibling.next_sibling['id'], '3')
- # Note the difference between nextSibling and next.
+ # Note the difference between next_sibling and next.
self.assertEquals(self.start.next['id'], '1.1')
def test_next_sibling_may_not_exist(self):
- self.assertEquals(self.tree.html.nextSibling, None)
+ self.assertEquals(self.tree.html.next_sibling, None)
nested_span = self.tree.find(id="1.1")
- self.assertEquals(nested_span.nextSibling, None)
+ self.assertEquals(nested_span.next_sibling, None)
last_span = self.tree.find(id="4")
- self.assertEquals(last_span.nextSibling, None)
+ self.assertEquals(last_span.next_sibling, None)
def test_find_next_sibling(self):
self.assertEquals(self.start.find_next_sibling('span')['id'], '2')
@@ -459,8 +480,8 @@ class TestNextSibling(SiblingTest):
def test_next_sibling_for_text_element(self):
soup = self.soup("Foo<b>bar</b>baz")
start = soup.find(text="Foo")
- self.assertEquals(start.nextSibling.name, 'b')
- self.assertEquals(start.nextSibling.nextSibling, 'baz')
+ self.assertEquals(start.next_sibling.name, 'b')
+ self.assertEquals(start.next_sibling.next_sibling, 'baz')
self.assertSelects(start.find_next_siblings('b'), ['bar'])
self.assertEquals(start.find_next_sibling(text="baz"), "baz")
@@ -474,23 +495,23 @@ class TestPreviousSibling(SiblingTest):
self.end = self.tree.find(id="4")
def test_previous_sibling_of_root_is_none(self):
- self.assertEquals(self.tree.previousSibling, None)
+ self.assertEquals(self.tree.previous_sibling, None)
def test_previous_sibling(self):
- self.assertEquals(self.end.previousSibling['id'], '3')
- self.assertEquals(self.end.previousSibling.previousSibling['id'], '2')
+ self.assertEquals(self.end.previous_sibling['id'], '3')
+ self.assertEquals(self.end.previous_sibling.previous_sibling['id'], '2')
- # Note the difference between previousSibling and previous.
+ # Note the difference between previous_sibling and previous.
self.assertEquals(self.end.previous['id'], '3.1')
def test_previous_sibling_may_not_exist(self):
- self.assertEquals(self.tree.html.previousSibling, None)
+ self.assertEquals(self.tree.html.previous_sibling, None)
nested_span = self.tree.find(id="1.1")
- self.assertEquals(nested_span.previousSibling, None)
+ self.assertEquals(nested_span.previous_sibling, None)
first_span = self.tree.find(id="1")
- self.assertEquals(first_span.previousSibling, None)
+ self.assertEquals(first_span.previous_sibling, None)
def test_find_previous_sibling(self):
self.assertEquals(self.end.find_previous_sibling('span')['id'], '3')
@@ -504,8 +525,8 @@ class TestPreviousSibling(SiblingTest):
def test_previous_sibling_for_text_element(self):
soup = self.soup("Foo<b>bar</b>baz")
start = soup.find(text="baz")
- self.assertEquals(start.previousSibling.name, 'b')
- self.assertEquals(start.previousSibling.previousSibling, 'Foo')
+ self.assertEquals(start.previous_sibling.name, 'b')
+ self.assertEquals(start.previous_sibling.previous_sibling, 'Foo')
self.assertSelects(start.find_previous_siblings('b'), ['bar'])
self.assertEquals(start.find_previous_sibling(text="Foo"), "Foo")
@@ -536,22 +557,22 @@ class TestTreeModification(SoupTest):
'<body><a href="http://foo.com/"></a><ol></ol></body>')
def test_append_to_contents_moves_tag(self):
- doc = """<p id="1">Don't leave me <b>here</b>.</p>
+ doc = """<p id="1">Don't leave me <b>here</b>.</p>
<p id="2">Don\'t leave!</p>"""
- soup = self.soup(doc)
- second_para = soup.find(id='2')
- bold = soup.b
+ soup = self.soup(doc)
+ second_para = soup.find(id='2')
+ bold = soup.b
- # Move the <b> tag to the end of the second paragraph.
- soup.find(id='2').append(soup.b)
+ # Move the <b> tag to the end of the second paragraph.
+ soup.find(id='2').append(soup.b)
- # The <b> tag is now a child of the second paragraph.
- self.assertEqual(bold.parent, second_para)
+ # The <b> tag is now a child of the second paragraph.
+ self.assertEqual(bold.parent, second_para)
- self.assertEqual(
- soup.decode(), self.document_for(
- '<p id="1">Don\'t leave me .</p>\n'
- '<p id="2">Don\'t leave!<b>here</b></p>'))
+ self.assertEqual(
+ soup.decode(), self.document_for(
+ '<p id="1">Don\'t leave me .</p>\n'
+ '<p id="2">Don\'t leave!<b>here</b></p>'))
def test_replace_tag_with_itself(self):
text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
@@ -585,13 +606,12 @@ class TestTreeModification(SoupTest):
self.assertEqual(new_text.previous, "Argh!")
self.assertEqual(new_text.previous.next, new_text)
- self.assertEqual(new_text.previousSibling, "Argh!")
- self.assertEqual(new_text.previousSibling.nextSibling, new_text)
+ self.assertEqual(new_text.previous_sibling, "Argh!")
+ self.assertEqual(new_text.previous_sibling.next_sibling, new_text)
- self.assertEqual(new_text.nextSibling, None)
+ self.assertEqual(new_text.next_sibling, None)
self.assertEqual(new_text.next, soup.c)
-
def test_insert_tag(self):
builder = self.default_builder
soup = self.soup(
@@ -606,16 +626,16 @@ class TestTreeModification(SoupTest):
# Make sure all the relationships are hooked up correctly.
b_tag = soup.b
- self.assertEqual(b_tag.nextSibling, magic_tag)
- self.assertEqual(magic_tag.previousSibling, b_tag)
+ self.assertEqual(b_tag.next_sibling, magic_tag)
+ self.assertEqual(magic_tag.previous_sibling, b_tag)
find = b_tag.find(text="Find")
self.assertEqual(find.next, magic_tag)
self.assertEqual(magic_tag.previous, find)
c_tag = soup.c
- self.assertEqual(magic_tag.nextSibling, c_tag)
- self.assertEqual(c_tag.previousSibling, magic_tag)
+ self.assertEqual(magic_tag.next_sibling, c_tag)
+ self.assertEqual(c_tag.previous_sibling, magic_tag)
the = magic_tag.find(text="the")
self.assertEqual(the.parent, magic_tag)
@@ -644,7 +664,7 @@ class TestTreeModification(SoupTest):
self.assertEquals(show.parent, None)
self.assertEquals(no.parent, soup.p)
self.assertEquals(no.next, "no")
- self.assertEquals(no.nextSibling, " business")
+ self.assertEquals(no.next_sibling, " business")
def test_nested_tag_replace_with(self):
soup = self.soup(
@@ -664,23 +684,31 @@ class TestTreeModification(SoupTest):
self.assertEqual(remove_tag.parent, None)
self.assertEqual(remove_tag.find(text="right").next, None)
self.assertEqual(remove_tag.previous, None)
- self.assertEqual(remove_tag.nextSibling, None)
- self.assertEqual(remove_tag.previousSibling, None)
+ self.assertEqual(remove_tag.next_sibling, None)
+ self.assertEqual(remove_tag.previous_sibling, None)
# The <f> tag is now connected to the <a> tag.
self.assertEqual(move_tag.parent, soup.a)
self.assertEqual(move_tag.previous, "We")
self.assertEqual(move_tag.next.next, soup.e)
- self.assertEqual(move_tag.nextSibling, None)
+ self.assertEqual(move_tag.next_sibling, None)
# The gap where the <f> tag used to be has been mended, and
# the word "to" is now connected to the <g> tag.
to_text = soup.find(text="to")
g_tag = soup.g
self.assertEqual(to_text.next, g_tag)
- self.assertEqual(to_text.nextSibling, g_tag)
+ self.assertEqual(to_text.next_sibling, g_tag)
self.assertEqual(g_tag.previous, to_text)
- self.assertEqual(g_tag.previousSibling, to_text)
+ self.assertEqual(g_tag.previous_sibling, to_text)
+
+ def test_replace_with_children(self):
+ tree = self.soup("""
+ <p>Unneeded <em>formatting</em> is unneeded</p>
+ """)
+ tree.em.replace_with_children()
+ self.assertEqual(tree.em, None)
+ self.assertEqual(tree.p.text, "Unneeded formatting is unneeded")
def test_extract(self):
soup = self.soup(
@@ -703,9 +731,31 @@ class TestTreeModification(SoupTest):
content_1 = soup.find(text="Some content. ")
content_2 = soup.find(text=" More content.")
self.assertEquals(content_1.next, content_2)
- self.assertEquals(content_1.nextSibling, content_2)
+ self.assertEquals(content_1.next_sibling, content_2)
self.assertEquals(content_2.previous, content_1)
- self.assertEquals(content_2.previousSibling, content_1)
+ self.assertEquals(content_2.previous_sibling, content_1)
+
+ def test_clear(self):
+ """Tag.clear()"""
+ soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")
+ # clear using extract()
+ a = soup.a
+ soup.p.clear()
+ self.assertEqual(len(soup.p.contents), 0)
+ self.assertTrue(hasattr(a, "contents"))
+
+ # clear using decompose()
+ em = a.em
+ a.clear(decompose=True)
+ self.assertFalse(hasattr(em, "contents"))
+
+ def test_string_set(self):
+ """Tag.string = 'string'"""
+ soup = self.soup("<a></a> <b><c></c></b>")
+ soup.a.string = "foo"
+ self.assertEqual(soup.a.contents, ["foo"])
+ soup.b.string = "bar"
+ self.assertEqual(soup.b.contents, ["bar"])
class TestElementObjects(SoupTest):
@@ -781,7 +831,6 @@ class TestElementObjects(SoupTest):
self.assertEqual(soup.a.string, "foo")
self.assertEqual(soup.string, "foo")
-
def test_lack_of_string(self):
"""Only a tag containing a single text node has a .string."""
soup = self.soup("<b>f<i>e</i>o</b>")
@@ -790,6 +839,14 @@ class TestElementObjects(SoupTest):
soup = self.soup("<b></b>")
self.assertFalse(soup.b.string)
+ def test_all_text(self):
+ """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated"""
+ soup = self.soup("<a>a<b>r</b> <r> t </r></a>")
+ self.assertEqual(soup.a.text, "ar t ")
+ self.assertEqual(soup.a.get_text(strip=True), "art")
+ self.assertEqual(soup.a.get_text(","), "a,r, , t ")
+ self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
+
class TestPersistence(SoupTest):
"Testing features like pickle and deepcopy."