From 927e5de747ed2d87b55eb6e8dad46d8598f27e09 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 21 May 2011 11:23:05 -0400 Subject: Changed dammit.py to require fewer changes to be Python 3 compatible. --- CHANGELOG | 1 + bs4/dammit.py | 13 ++++++------- tests/test_soup.py | 12 ++++++------ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 00d80da..cd01b3b 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -38,6 +38,7 @@ work. Here are the renames: Some attributes have also been renamed: * Tag.isSelfClosing -> Tag.is_empty_element + * UnicodeDammit.unicode -> UnicodeDammit.unicode_markup So have some arguments to popular methods: diff --git a/bs4/dammit.py b/bs4/dammit.py index 75d445e..4aafe81 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -9,7 +9,6 @@ encoding; that's the tree builder's job. import codecs from htmlentitydefs import codepoint2name import re -import types # Autodetects character encodings. Very useful. # Download from http://chardet.feedparser.org/ @@ -37,7 +36,7 @@ class EntitySubstitution(object): lookup = {} reverse_lookup = {} characters = [] - for codepoint, name in codepoint2name.items(): + for codepoint, name in list(codepoint2name.items()): if codepoint == 34: # There's no point in turning the quotation mark into # ", unless it happens within an attribute value, which @@ -175,7 +174,7 @@ class UnicodeDammit: self.tried_encodings = [] if markup == '' or isinstance(markup, unicode): self.original_encoding = None - self.unicode = unicode(markup) + self.unicode_markup = unicode(markup) return u = None @@ -197,7 +196,7 @@ class UnicodeDammit: if u: break - self.unicode = u + self.unicode_markup = u if not u: self.original_encoding = None def _sub_ms_char(self, match): @@ -205,7 +204,7 @@ class UnicodeDammit: entity.""" orig = match.group(1) sub = self.MS_CHARS.get(orig) - if type(sub) == types.TupleType: + if type(sub) == tuple: if self.smart_quotes_to == 'xml': sub = '&#x'.encode() + 
sub[1].encode() + ';'.encode() else: @@ -234,7 +233,7 @@ class UnicodeDammit: u = self._to_unicode(markup, proposed) self.markup = u self.original_encoding = proposed - except Exception, e: + except Exception as e: # print "That didn't work!" # print e return None @@ -375,7 +374,7 @@ class UnicodeDammit: 250,251,252,253,254,255) import string c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ - ''.join(map(chr, range(256))), ''.join(map(chr, emap))) + ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap))) return s.translate(c.EBCDIC_TO_ASCII_MAP) MS_CHARS = { '\x80' : ('euro', '20AC'), diff --git a/tests/test_soup.py b/tests/test_soup.py index d283b8a..87d6f3b 100644 --- a/tests/test_soup.py +++ b/tests/test_soup.py @@ -86,37 +86,37 @@ class TestUnicodeDammit(unittest.TestCase): markup = "\x91\x92\x93\x94" dammit = UnicodeDammit(markup) self.assertEquals( - dammit.unicode, u"\u2018\u2019\u201c\u201d") + dammit.unicode_markup, u"\u2018\u2019\u201c\u201d") def test_smart_quotes_to_xml_entities(self): markup = "\x91\x92\x93\x94" dammit = UnicodeDammit(markup, smart_quotes_to="xml") self.assertEquals( - dammit.unicode, "‘’“”") + dammit.unicode_markup, "‘’“”") def test_smart_quotes_to_html_entities(self): markup = "\x91\x92\x93\x94" dammit = UnicodeDammit(markup, smart_quotes_to="html") self.assertEquals( - dammit.unicode, "‘’“”") + dammit.unicode_markup, "‘’“”") def test_detect_utf8(self): utf8 = "\xc3\xa9" dammit = UnicodeDammit(utf8) - self.assertEquals(dammit.unicode, u'\xe9') + self.assertEquals(dammit.unicode_markup, u'\xe9') self.assertEquals(dammit.original_encoding, 'utf-8') def test_convert_hebrew(self): hebrew = "\xed\xe5\xec\xf9" dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) self.assertEquals(dammit.original_encoding, 'iso-8859-8') - self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9') + self.assertEquals(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') def test_dont_see_smart_quotes_where_there_are_none(self): utf_8 = 
"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" dammit = UnicodeDammit(utf_8) self.assertEquals(dammit.original_encoding, 'utf-8') - self.assertEquals(dammit.unicode.encode("utf-8"), utf_8) + self.assertEquals(dammit.unicode_markup.encode("utf-8"), utf_8) def test_ignore_inappropriate_codecs(self): utf8_data = u"Räksmörgås".encode("utf-8") -- cgit v1.2.3 From 5045b8cfcdd7556a1e2c4a2999d4ed1108b0425a Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 21 May 2011 11:26:28 -0400 Subject: Reduced the difference between Python 2's __init__.py and Python 3's __init__.py. --- bs4/__init__.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/bs4/__init__.py b/bs4/__init__.py index 8baeec4..22ecc43 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -16,7 +16,6 @@ For more than you ever wanted to know about Beautiful Soup, see the documentation: http://www.crummy.com/software/BeautifulSoup/documentation.html """ -from __future__ import generators __author__ = "Leonard Richardson (leonardr@segfault.org)" __version__ = "4.0.0a" @@ -27,9 +26,9 @@ __all__ = ['BeautifulSoup'] import re -from builder import builder_registry -from dammit import UnicodeDammit -from element import DEFAULT_OUTPUT_ENCODING, NavigableString, Tag +from .builder import builder_registry +from .dammit import UnicodeDammit +from .element import DEFAULT_OUTPUT_ENCODING, NavigableString, Tag class BeautifulSoup(Tag): -- cgit v1.2.3 From bc568d5a817c9104d984215e036dad87506f6bfc Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 21 May 2011 11:57:46 -0400 Subject: Renamed .next and .previous to .next_element and .previous_element. 
--- CHANGELOG | 2 ++ bs4/__init__.py | 16 +++++++------- bs4/element.py | 64 +++++++++++++++++++++++++++++------------------------- tests/test_tree.py | 62 ++++++++++++++++++++++++++-------------------------- 4 files changed, 76 insertions(+), 68 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index cd01b3b..a636544 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -39,6 +39,8 @@ Some attributes have also been renamed: * Tag.isSelfClosing -> Tag.is_empty_element * UnicodeDammit.unicode -> UnicodeDammit.unicode_markup + * Tag.next -> Tag.next_element + * Tag.previous -> Tag.previous_element So have some arguments to popular methods: diff --git a/bs4/__init__.py b/bs4/__init__.py index 22ecc43..6406bef 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -159,10 +159,10 @@ class BeautifulSoup(Tag): def object_was_parsed(self, o): """Add an object to the parse tree.""" - o.setup(self.currentTag, self.previous) - if self.previous: - self.previous.next = o - self.previous = o + o.setup(self.currentTag, self.previous_element) + if self.previous_element: + self.previous_element.next_element = o + self.previous_element = o self.currentTag.contents.append(o) @@ -206,12 +206,12 @@ class BeautifulSoup(Tag): return None tag = Tag(self, self.builder, name, attrs, self.currentTag, - self.previous) + self.previous_element) if tag is None: return tag - if self.previous: - self.previous.next = tag - self.previous = tag + if self.previous_element: + self.previous_element.next_element = tag + self.previous_element = tag self.pushTag(tag) return tag diff --git a/bs4/element.py b/bs4/element.py index 587078c..729ab36 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1,9 +1,6 @@ +import collections import re import types -try: - from htmlentitydefs import name2codepoint -except ImportError: - name2codepoint = {} from bs4.dammit import EntitySubstitution DEFAULT_OUTPUT_ENCODING = "utf-8" @@ -13,12 +10,12 @@ class PageElement(object): """Contains the navigational information for some part of 
the page (either a tag or a piece of text)""" - def setup(self, parent=None, previous=None): + def setup(self, parent=None, previous_element=None): """Sets up the initial relations between this element and other elements.""" self.parent = parent - self.previous = previous - self.next = None + self.previous_element = previous_element + self.next_element = None self.previousSibling = None self.nextSibling = None if self.parent and self.parent.contents: @@ -52,14 +49,14 @@ class PageElement(object): #this element (and any children) hadn't been parsed. Connect #the two. lastChild = self._last_recursive_child() - nextElement = lastChild.next + nextElement = lastChild.next_element - if self.previous: - self.previous.next = nextElement + if self.previous_element: + self.previous_element.next_element = nextElement if nextElement: - nextElement.previous = self.previous - self.previous = None - lastChild.next = None + nextElement.previous_element = self.previous_element + self.previous_element = None + lastChild.next_element = None self.parent = None if self.previousSibling: @@ -100,14 +97,14 @@ class PageElement(object): previousChild = None if position == 0: newChild.previousSibling = None - newChild.previous = self + newChild.previous_element = self else: previousChild = self.contents[position-1] newChild.previousSibling = previousChild newChild.previousSibling.nextSibling = newChild - newChild.previous = previousChild._last_recursive_child() - if newChild.previous: - newChild.previous.next = newChild + newChild.previous_element = previousChild._last_recursive_child() + if newChild.previous_element: + newChild.previous_element.next_element = newChild newChildsLastElement = newChild._last_recursive_child() @@ -122,18 +119,18 @@ class PageElement(object): if not parent: # This is the last element in the document. 
break if parentsNextSibling: - newChildsLastElement.next = parentsNextSibling + newChildsLastElement.next_element = parentsNextSibling else: - newChildsLastElement.next = None + newChildsLastElement.next_element = None else: nextChild = self.contents[position] newChild.nextSibling = nextChild if newChild.nextSibling: newChild.nextSibling.previousSibling = newChild - newChildsLastElement.next = nextChild + newChildsLastElement.next_element = nextChild - if newChildsLastElement.next: - newChildsLastElement.next.previous = newChildsLastElement + if newChildsLastElement.next_element: + newChildsLastElement.next_element.previous_element = newChildsLastElement self.contents.insert(position, newChild) def append(self, tag): @@ -223,6 +220,14 @@ class PageElement(object): findParents = find_parents # BS3 fetchParents = find_parents # BS2 + @property + def next(self): + return self.next_element + + @property + def previous(self): + return self.previous_element + #These methods do the real heavy lifting. def _find_one(self, method, name, attrs, text, **kwargs): @@ -260,7 +265,7 @@ class PageElement(object): def next_elements(self): i = self while i: - i = i.next + i = i.next_element yield i @property @@ -274,7 +279,7 @@ class PageElement(object): def previous_elements(self): i = self while i: - i = i.previous + i = i.previous_element yield i @property @@ -688,11 +693,11 @@ class Tag(PageElement): def recursive_children(self): if not len(self.contents): raise StopIteration # XXX return instead? 
- stopNode = self._last_recursive_child().next + stopNode = self._last_recursive_child().next_element current = self.contents[0] while current is not stopNode: yield current - current = current.next + current = current.next_element # Old names for backwards compatibility def childGenerator(self): @@ -733,8 +738,9 @@ class SoupStrainer(object): if isinstance(markupName, Tag): markup = markupName markupAttrs = markup - callFunctionWithTagData = callable(self.name) \ - and not isinstance(markupName, Tag) + callFunctionWithTagData = ( + isinstance(self.name, collections.Callable) + and not isinstance(markupName, Tag)) if (not self.name) \ or callFunctionWithTagData \ @@ -795,7 +801,7 @@ class SoupStrainer(object): result = False if matchAgainst == True and type(matchAgainst) == types.BooleanType: result = markup != None - elif callable(matchAgainst): + elif isinstance(matchAgainst, collections.Callable): result = matchAgainst(markup) else: #Custom match methods take the tag as an argument, but all diff --git a/tests/test_tree.py b/tests/test_tree.py index f2989fe..87a7e3a 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -307,16 +307,16 @@ class TestNextOperations(ProximityTest): self.start = self.tree.b def test_next(self): - self.assertEquals(self.start.next, "One") - self.assertEquals(self.start.next.next['id'], "2") + self.assertEquals(self.start.next_element, "One") + self.assertEquals(self.start.next_element.next_element['id'], "2") def test_next_of_last_item_is_none(self): last = self.tree.find(text="Three") - self.assertEquals(last.next, None) + self.assertEquals(last.next_element, None) def test_next_of_root_is_none(self): # The document root is outside the next/previous chain. 
- self.assertEquals(self.tree.next, None) + self.assertEquals(self.tree.next_element, None) def test_find_all_next(self): self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"]) @@ -352,17 +352,17 @@ class TestPreviousOperations(ProximityTest): self.end = self.tree.find(text="Three") def test_previous(self): - self.assertEquals(self.end.previous['id'], "3") - self.assertEquals(self.end.previous.previous, "Two") + self.assertEquals(self.end.previous_element['id'], "3") + self.assertEquals(self.end.previous_element.previous_element, "Two") def test_previous_of_first_item_is_none(self): first = self.tree.find('html') - self.assertEquals(first.previous, None) + self.assertEquals(first.previous_element, None) def test_previous_of_root_is_none(self): # The document root is outside the next/previous chain. # XXX This is broken! - #self.assertEquals(self.tree.previous, None) + #self.assertEquals(self.tree.previous_element, None) pass def test_find_all_previous(self): @@ -436,7 +436,7 @@ class TestNextSibling(SiblingTest): self.assertEquals(self.start.nextSibling.nextSibling['id'], '3') # Note the difference between nextSibling and next. - self.assertEquals(self.start.next['id'], '1.1') + self.assertEquals(self.start.next_element['id'], '1.1') def test_next_sibling_may_not_exist(self): self.assertEquals(self.tree.html.nextSibling, None) @@ -481,7 +481,7 @@ class TestPreviousSibling(SiblingTest): self.assertEquals(self.end.previousSibling.previousSibling['id'], '2') # Note the difference between previousSibling and previous. 
- self.assertEquals(self.end.previous['id'], '3.1') + self.assertEquals(self.end.previous_element['id'], '3.1') def test_previous_sibling_may_not_exist(self): self.assertEquals(self.tree.html.previousSibling, None) @@ -565,10 +565,10 @@ class TestTreeModification(SoupTest): soup.find(text="Argh!").replace_with("Hooray!") new_text = soup.find(text="Hooray!") b = soup.b - self.assertEqual(new_text.previous, b) + self.assertEqual(new_text.previous_element, b) self.assertEqual(new_text.parent, b) - self.assertEqual(new_text.previous.next, new_text) - self.assertEqual(new_text.next, None) + self.assertEqual(new_text.previous_element.next_element, new_text) + self.assertEqual(new_text.next_element, None) def test_consecutive_text_nodes(self): # A builder should never create two consecutive text nodes, @@ -582,14 +582,14 @@ class TestTreeModification(SoupTest): "Argh!Hooray!")) new_text = soup.find(text="Hooray!") - self.assertEqual(new_text.previous, "Argh!") - self.assertEqual(new_text.previous.next, new_text) + self.assertEqual(new_text.previous_element, "Argh!") + self.assertEqual(new_text.previous_element.next_element, new_text) self.assertEqual(new_text.previousSibling, "Argh!") self.assertEqual(new_text.previousSibling.nextSibling, new_text) self.assertEqual(new_text.nextSibling, None) - self.assertEqual(new_text.next, soup.c) + self.assertEqual(new_text.next_element, soup.c) def test_insert_tag(self): @@ -610,8 +610,8 @@ class TestTreeModification(SoupTest): self.assertEqual(magic_tag.previousSibling, b_tag) find = b_tag.find(text="Find") - self.assertEqual(find.next, magic_tag) - self.assertEqual(magic_tag.previous, find) + self.assertEqual(find.next_element, magic_tag) + self.assertEqual(magic_tag.previous_element, find) c_tag = soup.c self.assertEqual(magic_tag.nextSibling, c_tag) @@ -619,8 +619,8 @@ class TestTreeModification(SoupTest): the = magic_tag.find(text="the") self.assertEqual(the.parent, magic_tag) - self.assertEqual(the.next, c_tag) - 
self.assertEqual(c_tag.previous, the) + self.assertEqual(the.next_element, c_tag) + self.assertEqual(c_tag.previous_element, the) def test_insert_works_on_empty_element_tag(self): # This is a little strange, since most HTML parsers don't allow @@ -643,7 +643,7 @@ class TestTreeModification(SoupTest): self.assertEquals(show.parent, None) self.assertEquals(no.parent, soup.p) - self.assertEquals(no.next, "no") + self.assertEquals(no.next_element, "no") self.assertEquals(no.nextSibling, " business") def test_nested_tag_replace_with(self): @@ -662,24 +662,24 @@ class TestTreeModification(SoupTest): # The tag is now an orphan. self.assertEqual(remove_tag.parent, None) - self.assertEqual(remove_tag.find(text="right").next, None) - self.assertEqual(remove_tag.previous, None) + self.assertEqual(remove_tag.find(text="right").next_element, None) + self.assertEqual(remove_tag.previous_element, None) self.assertEqual(remove_tag.nextSibling, None) self.assertEqual(remove_tag.previousSibling, None) # The tag is now connected to the tag. self.assertEqual(move_tag.parent, soup.a) - self.assertEqual(move_tag.previous, "We") - self.assertEqual(move_tag.next.next, soup.e) + self.assertEqual(move_tag.previous_element, "We") + self.assertEqual(move_tag.next_element.next_element, soup.e) self.assertEqual(move_tag.nextSibling, None) # The gap where the tag used to be has been mended, and # the word "to" is now connected to the tag. to_text = soup.find(text="to") g_tag = soup.g - self.assertEqual(to_text.next, g_tag) + self.assertEqual(to_text.next_element, g_tag) self.assertEqual(to_text.nextSibling, g_tag) - self.assertEqual(g_tag.previous, to_text) + self.assertEqual(g_tag.previous_element, to_text) self.assertEqual(g_tag.previousSibling, to_text) def test_extract(self): @@ -696,15 +696,15 @@ class TestTreeModification(SoupTest): # The extracted tag is now an orphan. 
self.assertEqual(len(soup.body.contents), 2) self.assertEqual(extracted.parent, None) - self.assertEqual(extracted.previous, None) - self.assertEqual(extracted.next.next, None) + self.assertEqual(extracted.previous_element, None) + self.assertEqual(extracted.next_element.next_element, None) # The gap where the extracted tag used to be has been mended. content_1 = soup.find(text="Some content. ") content_2 = soup.find(text=" More content.") - self.assertEquals(content_1.next, content_2) + self.assertEquals(content_1.next_element, content_2) self.assertEquals(content_1.nextSibling, content_2) - self.assertEquals(content_2.previous, content_1) + self.assertEquals(content_2.previous_element, content_1) self.assertEquals(content_2.previousSibling, content_1) -- cgit v1.2.3 From 7103a5f5ebcf655f9f8288eb54663b2485e197a9 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 21 May 2011 12:00:16 -0400 Subject: More Python 3 compatibility. --- bs4/element.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/bs4/element.py b/bs4/element.py index 729ab36..c7dbd6b 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -75,8 +75,7 @@ class PageElement(object): def insert(self, position, newChild): if (isinstance(newChild, basestring) - or isinstance(newChild, unicode)) \ - and not isinstance(newChild, NavigableString): + and not isinstance(newChild, NavigableString)): newChild = NavigableString(newChild) position = min(position, len(self.contents)) @@ -248,7 +247,7 @@ class PageElement(object): results = ResultSet(strainer) while True: try: - i = generator.next() + i = next(generator) except StopIteration: break if i: @@ -346,7 +345,9 @@ class NavigableString(unicode, PageElement): if attr == 'string': return self else: - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) + raise AttributeError( + "'%s' object has no attribute '%s'" % ( + self.__class__.__name__, attr)) def output_ready(self, 
substitute_html_entities=False): if substitute_html_entities: @@ -464,7 +465,7 @@ class Tag(PageElement): return self.attrs.get(key, default) def has_key(self, key): - return self.attrs.has_key(key) + return key in self.attrs def __getitem__(self, key): """tag[key] returns the value of the 'key' attribute for the tag, @@ -493,7 +494,7 @@ class Tag(PageElement): def __delitem__(self, key): "Deleting tag[key] deletes all 'key' attributes for the tag." - if self.attrs.has_key(key): + if key in self.attrs: del self.attrs[key] def __call__(self, *args, **kwargs): -- cgit v1.2.3 From f2f5df1563c3861a1f28bcfc0532d2e54de50cab Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 21 May 2011 12:01:25 -0400 Subject: More Python 3 compatibility. --- bs4/element.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bs4/element.py b/bs4/element.py index c7dbd6b..a10e615 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -501,7 +501,7 @@ class Tag(PageElement): """Calling a tag like a function is the same as calling its find_all() method. Eg. tag('a') returns a list of all the A tags found within this tag.""" - return apply(self.find_all, args, kwargs) + return self.find_all(*args, **kwargs) def __getattr__(self, tag): #print "Getattr %s.%s" % (self.__class__, tag) @@ -509,7 +509,8 @@ class Tag(PageElement): return self.find(tag[:-3]) elif tag.find('__') != 0: return self.find(tag) - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) + raise AttributeError( + "'%s' object has no attribute '%s'" % (self.__class__, tag)) def __eq__(self, other): """Returns true iff this tag has the same name, the same attributes, -- cgit v1.2.3 From 60b789089df25026b48d0a63b054bfa1e347aac9 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 21 May 2011 12:04:42 -0400 Subject: More Python 3 compatibility. 
--- bs4/element.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/bs4/element.py b/bs4/element.py index a10e615..fb768d1 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1,6 +1,5 @@ import collections import re -import types from bs4.dammit import EntitySubstitution DEFAULT_OUTPUT_ENCODING = "utf-8" @@ -753,7 +752,7 @@ class SoupStrainer(object): else: match = True markupAttrMap = None - for attr, matchAgainst in self.attrs.items(): + for attr, matchAgainst in list(self.attrs.items()): if not markupAttrMap: if hasattr(markupAttrs, 'get'): markupAttrMap = markupAttrs @@ -794,14 +793,14 @@ class SoupStrainer(object): if self._matches(markup, self.text): found = markup else: - raise Exception, "I don't know how to match against a %s" \ - % markup.__class__ + raise Exception( + "I don't know how to match against a %s" % markup.__class__) return found def _matches(self, markup, matchAgainst): #print "Matching %s against %s" % (markup, matchAgainst) result = False - if matchAgainst == True and type(matchAgainst) == types.BooleanType: + if matchAgainst == True and isinstance(matchAgainst, bool): result = markup != None elif isinstance(matchAgainst, collections.Callable): result = matchAgainst(markup) @@ -821,7 +820,7 @@ class SoupStrainer(object): or not isinstance(matchAgainst, basestring))): result = markup in matchAgainst elif hasattr(matchAgainst, 'items'): - result = markup.has_key(matchAgainst) + result = matchAgainst in markup elif matchAgainst and isinstance(markup, basestring): if isinstance(markup, unicode): matchAgainst = unicode(matchAgainst) -- cgit v1.2.3 From 581e6d941036081ad7d09d51b3469eb8de891e09 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 21 May 2011 12:06:15 -0400 Subject: More Python 3 compatibility. 
--- bs4/element.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/bs4/element.py b/bs4/element.py index fb768d1..10377e9 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -822,10 +822,7 @@ class SoupStrainer(object): elif hasattr(matchAgainst, 'items'): result = matchAgainst in markup elif matchAgainst and isinstance(markup, basestring): - if isinstance(markup, unicode): - matchAgainst = unicode(matchAgainst) - else: - matchAgainst = str(matchAgainst) + matchAgainst = markup.__class__(matchAgainst) if not result: result = matchAgainst == markup -- cgit v1.2.3 From fc53a81aa12a520af7144f17796c5d74c5aaff0a Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 21 May 2011 12:08:01 -0400 Subject: More Python 3 compatibility. --- bs4/builder/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index afd49b9..222eb5b 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -144,7 +144,7 @@ class SAXTreeBuilder(TreeBuilder): pass def startElement(self, name, attrs): - attrs = dict((key[1], value) for key, value in attrs.items()) + attrs = dict((key[1], value) for key, value in list(attrs.items())) #print "Start %s, %r" % (name, attrs) self.soup.handle_starttag(name, attrs) @@ -247,16 +247,16 @@ def register_treebuilders_from(module): # builder registrations will take precedence. In general, we want # html5lib to take precedence over lxml, because it's more # reliable. And we only want to use HTMLParser as a last result. -import _htmlparser +from . import _htmlparser register_treebuilders_from(_htmlparser) try: - import _lxml + from . import _lxml register_treebuilders_from(_lxml) except ImportError: # They don't have lxml installed. pass try: - import _html5lib + from . import _html5lib register_treebuilders_from(_html5lib) except ImportError: # They don't have html5lib installed. 
-- cgit v1.2.3 From 60630ce1ccd988bd449394c68d6eb817832c6e54 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 21 May 2011 12:13:26 -0400 Subject: More Python 3 compatibility. --- bs4/builder/__init__.py | 2 +- bs4/builder/_html5lib.py | 10 +++++----- bs4/builder/_lxml.py | 3 ++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index 222eb5b..e6d4fa1 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -247,7 +247,7 @@ def register_treebuilders_from(module): # builder registrations will take precedence. In general, we want # html5lib to take precedence over lxml, because it's more # reliable. And we only want to use HTMLParser as a last result. -from . import _htmlparser +from .import _htmlparser register_treebuilders_from(_htmlparser) try: from . import _lxml diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index d74c4b0..e9d7f58 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -102,18 +102,18 @@ class AttrList(object): self.element = element self.attrs = dict(self.element.attrs) def __iter__(self): - return self.attrs.items().__iter__() + return list(self.attrs.items()).__iter__() def __setitem__(self, name, value): "set attr", name, value self.element[name] = value def items(self): - return self.attrs.items() + return list(self.attrs.items()) def keys(self): - return self.attrs.keys() + return list(self.attrs.keys()) def __getitem__(self, name): return self.attrs[name] def __contains__(self, name): - return name in self.attrs.keys() + return name in list(self.attrs.keys()) class Element(html5lib.treebuilders._base.Node): @@ -155,7 +155,7 @@ class Element(html5lib.treebuilders._base.Node): def setAttributes(self, attributes): if attributes is not None and attributes != {}: - for name, value in attributes.items(): + for name, value in list(attributes.items()): self.element[name] = value # The attributes may contain variables that need 
substitution. # Call set_up_substitutions manually. diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 57798f6..07b2032 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -3,6 +3,7 @@ __all__ = [ 'LXMLTreeBuilder', ] +import collections from lxml import etree from bs4.element import Comment, Doctype from bs4.builder import ( @@ -36,7 +37,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): if parser is None: # Use the default parser. parser = self.default_parser - if callable(parser): + if isinstance(parser, collections.Callable): # Instantiate the parser with default arguments parser = parser(target=self, strip_cdata=False) self.parser = parser -- cgit v1.2.3 From 728ff36cd480a02c562976c5279e5a41240c1bfb Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 21 May 2011 12:26:35 -0400 Subject: Deprecated has_key in favor of has_attr. --- CHANGELOG | 7 +++++++ bs4/element.py | 6 +++++- tests/test_tree.py | 18 ++++++++++++------ 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index a636544..4d1d075 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -35,6 +35,13 @@ work. Here are the renames: * findPreviousSibling -> find_previous_sibling * findPreviousSiblings -> find_previous_siblings +Methods have been renamed for compatibility with Python 3. + + * Tag.has_key() -> Tag.has_attr() + + (This was misleading, anyway, because has_key() looked at + a tag's attributes and __contains__ looked at a tag's contents.) 
+ Some attributes have also been renamed: * Tag.isSelfClosing -> Tag.is_empty_element diff --git a/bs4/element.py b/bs4/element.py index 10377e9..a9814e1 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -463,7 +463,7 @@ class Tag(PageElement): attribute.""" return self.attrs.get(key, default) - def has_key(self, key): + def has_attr(self, key): return key in self.attrs def __getitem__(self, key): @@ -707,6 +707,10 @@ class Tag(PageElement): def recursiveChildGenerator(self): return self.recursive_children + # This was kind of misleading because has_key() (attributes) was + # different from __contains__ (contents). has_key() is gone in Python 3, + # anyway. + has_key = has_attr # Next, a couple classes to represent queries and their results. class SoupStrainer(object): diff --git a/tests/test_tree.py b/tests/test_tree.py index 87a7e3a..cf14d0c 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -10,7 +10,7 @@ methods tested here. """ import copy -import cPickle as pickle +import pickle import re from bs4 import BeautifulSoup from bs4.builder import builder_registry @@ -288,7 +288,7 @@ class TestParentOperations(TreeTest): def test_parent_generator(self): parents = [parent['id'] for parent in self.start.parents - if parent is not None and parent.has_key('id')] + if parent is not None and 'id' in parent.attrs] self.assertEquals(parents, ['bottom', 'middle', 'top']) @@ -735,11 +735,17 @@ class TestElementObjects(SoupTest): self.assertEqual(soup.a, None) self.assertEqual(soup.aTag, None) - def test_has_key(self): - """has_key() checks for the presence of an attribute.""" + def test_has_attr(self): + """has_attr() checks for the presence of an attribute. + + Please note: has_attr() is different from + __contains__. has_attr() checks the tag's attributes and __contains__ + checks the tag's children. 
+ """ soup = self.soup("") - self.assertTrue(soup.foo.has_key('attr')) - self.assertFalse(soup.foo.has_key('attr2')) + self.assertTrue(soup.foo.has_attr('attr')) + self.assertFalse(soup.foo.has_attr('attr2')) + def test_attributes_come_out_in_alphabetical_order(self): markup = '' -- cgit v1.2.3