summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- bs4/__init__.py 9
-rw-r--r-- bs4/dammit.py 91
-rw-r--r-- bs4/element.py 89
-rw-r--r-- bs4/testing.py 6
-rw-r--r-- bs4/util.py 23
-rw-r--r-- tests/test_tree.py 26
6 files changed, 134 insertions, 110 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index a1d7c90..66a1c02 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -65,7 +65,7 @@ class BeautifulSoup(Tag):
# can be replaced with a single space. A text node that contains
# fancy Unicode spaces (usually non-breaking) should be left
# alone.
- STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
+ STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None):
@@ -167,7 +167,6 @@ class BeautifulSoup(Tag):
self.previous = o
self.currentTag.contents.append(o)
-
def _popToTag(self, name, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
instance of the given tag. If inclusivePop is false, pops the tag
@@ -179,9 +178,10 @@ class BeautifulSoup(Tag):
numPops = 0
mostRecentTag = None
- for i in range(len(self.tagStack)-1, 0, -1):
+
+ for i in range(len(self.tagStack) - 1, 0, -1):
if name == self.tagStack[i].name:
- numPops = len(self.tagStack)-i
+ numPops = len(self.tagStack) - i
break
if not inclusivePop:
numPops = numPops - 1
@@ -217,7 +217,6 @@ class BeautifulSoup(Tag):
self.pushTag(tag)
return tag
-
def handle_endtag(self, name):
#print "End tag: " + name
self.endData()
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 75d445e..f3e770e 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -42,7 +42,7 @@ class EntitySubstitution(object):
# There's no point in turning the quotation mark into
# &quot;, unless it happens within an attribute value, which
# is handled elsewhere.
- continue;
+ continue
character = unichr(codepoint)
characters.append(character)
lookup[character] = name
@@ -52,13 +52,12 @@ class EntitySubstitution(object):
(CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
-
CHARACTER_TO_XML_ENTITY = {
- "'" : "apos",
- '"' : "quot",
- "&" : "amp",
- "<" : "lt",
- ">" : "gt",
+ "'": "apos",
+ '"': "quot",
+ "&": "amp",
+ "<": "lt",
+ ">": "gt",
}
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
@@ -157,8 +156,8 @@ class UnicodeDammit:
# meta tags to the corresponding Python codec names. It only covers
# values that aren't in Python's aliases and can't be determined
# by the heuristics in find_codec.
- CHARSET_ALIASES = { "macintosh" : "mac-roman",
- "x-sjis" : "shift-jis" }
+ CHARSET_ALIASES = {"macintosh": "mac-roman",
+ "x-sjis": "shift-jis"}
ENCODINGS_WITH_SMART_QUOTES = [
"windows-1252",
@@ -198,7 +197,8 @@ class UnicodeDammit:
break
self.unicode = u
- if not u: self.original_encoding = None
+ if not u:
+ self.original_encoding = None
def _sub_ms_char(self, match):
"""Changes a MS smart quote character to an XML or HTML
@@ -335,7 +335,6 @@ class UnicodeDammit:
xml_encoding = sniffed_xml_encoding
return xml_data, xml_encoding, sniffed_xml_encoding
-
def find_codec(self, charset):
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
or (charset and self._codec(charset.replace("-", ""))) \
@@ -343,7 +342,8 @@ class UnicodeDammit:
or charset
def _codec(self, charset):
- if not charset: return charset
+ if not charset:
+ return charset
codec = None
try:
codecs.lookup(charset)
@@ -353,6 +353,7 @@ class UnicodeDammit:
return codec
EBCDIC_TO_ASCII_MAP = None
+
def _ebcdic_to_ascii(self, s):
c = self.__class__
if not c.EBCDIC_TO_ASCII_MAP:
@@ -374,39 +375,39 @@ class UnicodeDammit:
90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
250,251,252,253,254,255)
import string
- c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
+ c.EBCDIC_TO_ASCII_MAP = string.maketrans(
''.join(map(chr, range(256))), ''.join(map(chr, emap)))
return s.translate(c.EBCDIC_TO_ASCII_MAP)
- MS_CHARS = { '\x80' : ('euro', '20AC'),
- '\x81' : ' ',
- '\x82' : ('sbquo', '201A'),
- '\x83' : ('fnof', '192'),
- '\x84' : ('bdquo', '201E'),
- '\x85' : ('hellip', '2026'),
- '\x86' : ('dagger', '2020'),
- '\x87' : ('Dagger', '2021'),
- '\x88' : ('circ', '2C6'),
- '\x89' : ('permil', '2030'),
- '\x8A' : ('Scaron', '160'),
- '\x8B' : ('lsaquo', '2039'),
- '\x8C' : ('OElig', '152'),
- '\x8D' : '?',
- '\x8E' : ('#x17D', '17D'),
- '\x8F' : '?',
- '\x90' : '?',
- '\x91' : ('lsquo', '2018'),
- '\x92' : ('rsquo', '2019'),
- '\x93' : ('ldquo', '201C'),
- '\x94' : ('rdquo', '201D'),
- '\x95' : ('bull', '2022'),
- '\x96' : ('ndash', '2013'),
- '\x97' : ('mdash', '2014'),
- '\x98' : ('tilde', '2DC'),
- '\x99' : ('trade', '2122'),
- '\x9a' : ('scaron', '161'),
- '\x9b' : ('rsaquo', '203A'),
- '\x9c' : ('oelig', '153'),
- '\x9d' : '?',
- '\x9e' : ('#x17E', '17E'),
- '\x9f' : ('Yuml', ''),}
+ MS_CHARS = {'\x80': ('euro', '20AC'),
+ '\x81': ' ',
+ '\x82': ('sbquo', '201A'),
+ '\x83': ('fnof', '192'),
+ '\x84': ('bdquo', '201E'),
+ '\x85': ('hellip', '2026'),
+ '\x86': ('dagger', '2020'),
+ '\x87': ('Dagger', '2021'),
+ '\x88': ('circ', '2C6'),
+ '\x89': ('permil', '2030'),
+ '\x8A': ('Scaron', '160'),
+ '\x8B': ('lsaquo', '2039'),
+ '\x8C': ('OElig', '152'),
+ '\x8D': '?',
+ '\x8E': ('#x17D', '17D'),
+ '\x8F': '?',
+ '\x90': '?',
+ '\x91': ('lsquo', '2018'),
+ '\x92': ('rsquo', '2019'),
+ '\x93': ('ldquo', '201C'),
+ '\x94': ('rdquo', '201D'),
+ '\x95': ('bull', '2022'),
+ '\x96': ('ndash', '2013'),
+ '\x97': ('mdash', '2014'),
+ '\x98': ('tilde', '2DC'),
+ '\x99': ('trade', '2122'),
+ '\x9a': ('scaron', '161'),
+ '\x9b': ('rsaquo', '203A'),
+ '\x9c': ('oelig', '153'),
+ '\x9d': '?',
+ '\x9e': ('#x17E', '17E'),
+ '\x9f': ('Yuml', ''),}
diff --git a/bs4/element.py b/bs4/element.py
index f9b475b..e141aa8 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -44,8 +44,8 @@ class PageElement(object):
self.previous_sibling = self.parent.contents[-1]
self.previous_sibling.next_sibling = self
- nextSibling = _alias("next_sibling") # BS3
- previousSibling = _alias("previous_sibling") # BS3
+ nextSibling = _alias("next_sibling") # BS3
+ previousSibling = _alias("previous_sibling") # BS3
def replace_with(self, replace_with):
if replace_with is self:
@@ -62,7 +62,7 @@ class PageElement(object):
my_index -= 1
self.extract()
old_parent.insert(my_index, replace_with)
- replaceWith = replace_with # BS3
+ replaceWith = replace_with # BS3
def replace_with_children(self):
my_parent = self.parent
@@ -70,7 +70,7 @@ class PageElement(object):
self.extract()
for child in reversed(self.contents[:]):
my_parent.insert(my_index, child)
- replaceWithChildren = replace_with_children # BS3
+ replaceWithChildren = replace_with_children # BS3
def extract(self):
"""Destructively rips this element out of the tree."""
@@ -112,7 +112,7 @@ class PageElement(object):
and not isinstance(new_child, NavigableString)):
new_child = NavigableString(new_child)
- position = min(position, len(self.contents))
+ position = min(position, len(self.contents))
if hasattr(new_child, 'parent') and new_child.parent is not None:
# We're 'inserting' an element that's already one
# of this object's children.
@@ -148,7 +148,7 @@ class PageElement(object):
while not parents_next_sibling:
parents_next_sibling = parent.next_sibling
parent = parent.parent
- if not parent: # This is the last element in the document.
+ if not parent: # This is the last element in the document.
break
if parents_next_sibling:
new_childs_last_element.next = parents_next_sibling
@@ -173,7 +173,7 @@ class PageElement(object):
"""Returns the first item that matches the given criteria and
appears after this Tag in the document."""
return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
- findNext = find_next # BS3
+ findNext = find_next # BS3
def find_all_next(self, name=None, attrs={}, text=None, limit=None,
**kwargs):
@@ -181,14 +181,14 @@ class PageElement(object):
after this Tag in the document."""
return self._find_all(name, attrs, text, limit, self.next_elements,
**kwargs)
- findAllNext = find_all_next # BS3
+ findAllNext = find_all_next # BS3
def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the closest sibling to this Tag that matches the
given criteria and appears after this Tag in the document."""
return self._find_one(self.find_next_siblings, name, attrs, text,
**kwargs)
- findNextSibling = find_next_sibling # BS3
+ findNextSibling = find_next_sibling # BS3
def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
**kwargs):
@@ -196,15 +196,15 @@ class PageElement(object):
criteria and appear after this Tag in the document."""
return self._find_all(name, attrs, text, limit,
self.next_siblings, **kwargs)
- findNextSiblings = find_next_siblings # BS3
- fetchNextSiblings = find_next_siblings # BS2
+ findNextSiblings = find_next_siblings # BS3
+ fetchNextSiblings = find_next_siblings # BS2
def find_previous(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the first item that matches the given criteria and
appears before this Tag in the document."""
return self._find_one(
self.find_all_previous, name, attrs, text, **kwargs)
- findPrevious = find_previous # BS3
+ findPrevious = find_previous # BS3
def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
**kwargs):
@@ -212,15 +212,15 @@ class PageElement(object):
before this Tag in the document."""
return self._find_all(name, attrs, text, limit, self.previous_elements,
**kwargs)
- findAllPrevious = find_all_previous # BS3
- fetchPrevious = find_all_previous # BS2
+ findAllPrevious = find_all_previous # BS3
+ fetchPrevious = find_all_previous # BS2
def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the closest sibling to this Tag that matches the
given criteria and appears before this Tag in the document."""
return self._find_one(self.find_previous_siblings, name, attrs, text,
**kwargs)
- findPreviousSibling = find_previous_sibling # BS3
+ findPreviousSibling = find_previous_sibling # BS3
def find_previous_siblings(self, name=None, attrs={}, text=None,
limit=None, **kwargs):
@@ -228,8 +228,8 @@ class PageElement(object):
criteria and appear before this Tag in the document."""
return self._find_all(name, attrs, text, limit,
self.previous_siblings, **kwargs)
- findPreviousSiblings = find_previous_siblings # BS3
- fetchPreviousSiblings = find_previous_siblings # BS2
+ findPreviousSiblings = find_previous_siblings # BS3
+ fetchPreviousSiblings = find_previous_siblings # BS2
def find_parent(self, name=None, attrs={}, **kwargs):
"""Returns the closest parent of this Tag that matches the given
@@ -241,7 +241,7 @@ class PageElement(object):
if l:
r = l[0]
return r
- findParent = find_parent # BS3
+ findParent = find_parent # BS3
def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
"""Returns the parents of this Tag that match the given
@@ -249,8 +249,8 @@ class PageElement(object):
return self._find_all(name, attrs, None, limit, self.parents,
**kwargs)
- findParents = find_parents # BS3
- fetchParents = find_parents # BS2
+ findParents = find_parents # BS3
+ fetchParents = find_parents # BS2
#These methods do the real heavy lifting.
@@ -381,7 +381,8 @@ class NavigableString(unicode, PageElement):
if attr == 'string':
return self
else:
- raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
+ raise AttributeError("'%s' object has no attribute '%s'" %
+ (self.__class__.__name__, attr))
def output_ready(self, substitute_html_entities=False):
if substitute_html_entities:
@@ -456,7 +457,7 @@ class Tag(PageElement):
self.can_be_empty_element = builder.can_be_empty_element(name)
- parserClass = _alias("parser_class") # BS3
+ parserClass = _alias("parser_class") # BS3
@property
def is_empty_element(self):
@@ -474,8 +475,7 @@ class Tag(PageElement):
then any tag with no contents is an empty-element tag.
"""
return len(self.contents) == 0 and self.can_be_empty_element
- isSelfClosing = is_empty_element # BS3
-
+ isSelfClosing = is_empty_element # BS3
@property
def string(self):
@@ -555,7 +555,7 @@ class Tag(PageElement):
return self.attrs.get(key, default)
def has_key(self, key):
- return self.attrs.has_key(key)
+ return key in self.attrs
def __getitem__(self, key):
"""tag[key] returns the value of the 'key' attribute for the tag,
@@ -584,8 +584,7 @@ class Tag(PageElement):
def __delitem__(self, key):
"Deleting tag[key] deletes all 'key' attributes for the tag."
- if self.attrs.has_key(key):
- del self.attrs[key]
+ self.attrs.pop(key, None)
def __call__(self, *args, **kwargs):
"""Calling a tag like a function is the same as calling its
@@ -595,18 +594,24 @@ class Tag(PageElement):
def __getattr__(self, tag):
#print "Getattr %s.%s" % (self.__class__, tag)
- if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: # TODO: Can this be endswith?
+ if len(tag) > 3 and tag.endswith('Tag'):
return self.find(tag[:-3])
- elif tag.find('__') != 0: # TODO: Can this be not startswith?
+ elif not tag.startswith("__"):
return self.find(tag)
- raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
+ raise AttributeError("'%s' object has no attribute '%s'" %
+ (self.__class__, tag))
def __eq__(self, other):
"""Returns true iff this tag has the same name, the same attributes,
and the same contents (recursively) as the given tag."""
if self is other:
return True
- if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
+ if (not hasattr(other, 'name') or
+ not hasattr(other, 'attrs') or
+ not hasattr(other, 'contents') or
+ self.name != other.name or
+ self.attrs != other.attrs or
+ len(self) != len(other)):
return False
for i, my_child in enumerate(self.contents):
if my_child != other.contents[i]:
@@ -670,7 +675,7 @@ class Tag(PageElement):
pretty_print = (indent_level is not None)
if pretty_print:
- space = (' ' * (indent_level-1))
+ space = (' ' * (indent_level - 1))
indent_contents = indent_level + 1
else:
space = ''
@@ -718,7 +723,7 @@ class Tag(PageElement):
encoding.
"""
pretty_print = (indent_level is not None)
- s=[]
+ s = []
for c in self:
text = None
if isinstance(c, NavigableString):
@@ -730,7 +735,7 @@ class Tag(PageElement):
text = text.strip()
if text:
if pretty_print:
- s.append(" " * (indent_level-1))
+ s.append(" " * (indent_level - 1))
s.append(text)
if pretty_print:
s.append("\n")
@@ -764,14 +769,14 @@ class Tag(PageElement):
if not recursive:
generator = self.children
return self._find_all(name, attrs, text, limit, generator, **kwargs)
- findAll = find_all # BS3
- findChildren = find_all # BS2
+ findAll = find_all # BS3
+ findChildren = find_all # BS2
#Generator methods
@property
def children(self):
# return iter() to make the purpose of the method clear
- return iter(self.contents) # XXX This seems to be untested.
+ return iter(self.contents) # XXX This seems to be untested.
@property
def recursive_children(self):
@@ -836,11 +841,11 @@ class SoupStrainer(object):
markup_attr_map = None
for attr, match_against in self.attrs.items():
if not markup_attr_map:
- if hasattr(markup_attrs, 'get'):
+ if hasattr(markup_attrs, 'get'):
markup_attr_map = markup_attrs
- else:
+ else:
markup_attr_map = {}
- for k,v in markup_attrs:
+ for k, v in markup_attrs:
markup_attr_map[k] = v
attr_value = markup_attr_map.get(attr)
if not self._matches(attr_value, match_against):
@@ -876,8 +881,8 @@ class SoupStrainer(object):
if self._matches(markup, self.text):
found = markup
else:
- raise Exception, "I don't know how to match against a %s" \
- % markup.__class__
+ raise Exception("I don't know how to match against a %s"
+ % markup.__class__)
return found
def _matches(self, markup, match_against):
diff --git a/bs4/testing.py b/bs4/testing.py
index 9d9c26a..91c623a 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
from bs4.element import Comment, SoupStrainer
from bs4.builder import LXMLTreeBuilder
+
class SoupTest(unittest.TestCase):
@property
@@ -30,8 +31,3 @@ class SoupTest(unittest.TestCase):
compare_parsed_to = to_parse
self.assertEquals(obj.decode(), self.document_for(compare_parsed_to))
-
-
-
-
-
diff --git a/bs4/util.py b/bs4/util.py
new file mode 100644
index 0000000..8e33273
--- /dev/null
+++ b/bs4/util.py
@@ -0,0 +1,23 @@
+# Helper functions and mixin classes for Beautiful Soup
+
+import types
+try:
+ set
+except NameError:
+ from sets import Set as set
+
+
+def isList(l):
+ """Convenience method that works with all 2.x versions of Python
+ to determine whether or not something is listlike."""
+ return ((hasattr(l, '__iter__') and not isinstance(l, basestring))
+ or (type(l) in (types.ListType, types.TupleType)))
+
+
+def buildSet(args=None):
+ """Turns a list or a string into a set."""
+ if isinstance(args, str):
+ return set([args])
+ if args is None:
+ return set()
+ return set(args)
diff --git a/tests/test_tree.py b/tests/test_tree.py
index 3862d7d..a6ad000 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -557,22 +557,22 @@ class TestTreeModification(SoupTest):
'<body><a href="http://foo.com/"></a><ol></ol></body>')
def test_append_to_contents_moves_tag(self):
- doc = """<p id="1">Don't leave me <b>here</b>.</p>
+ doc = """<p id="1">Don't leave me <b>here</b>.</p>
<p id="2">Don\'t leave!</p>"""
- soup = self.soup(doc)
- second_para = soup.find(id='2')
- bold = soup.b
+ soup = self.soup(doc)
+ second_para = soup.find(id='2')
+ bold = soup.b
- # Move the <b> tag to the end of the second paragraph.
- soup.find(id='2').append(soup.b)
+ # Move the <b> tag to the end of the second paragraph.
+ soup.find(id='2').append(soup.b)
- # The <b> tag is now a child of the second paragraph.
- self.assertEqual(bold.parent, second_para)
+ # The <b> tag is now a child of the second paragraph.
+ self.assertEqual(bold.parent, second_para)
- self.assertEqual(
- soup.decode(), self.document_for(
- '<p id="1">Don\'t leave me .</p>\n'
- '<p id="2">Don\'t leave!<b>here</b></p>'))
+ self.assertEqual(
+ soup.decode(), self.document_for(
+ '<p id="1">Don\'t leave me .</p>\n'
+ '<p id="2">Don\'t leave!<b>here</b></p>'))
def test_replace_tag_with_itself(self):
text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
@@ -700,7 +700,7 @@ class TestTreeModification(SoupTest):
self.assertEqual(to_text.next, g_tag)
self.assertEqual(to_text.next_sibling, g_tag)
self.assertEqual(g_tag.previous, to_text)
- self.assertEqual(g_tag.previous_sibling, to_text)
+ self.assertEqual(g_tag.previous_sibling, to_text)
def test_replace_with_children(self):
tree = self.soup("""