From 7bbefa1fcc9a6006953eb0a79049ece9f05985de Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 28 Jan 2011 11:39:36 -0500 Subject: Moved everything into the top-level directory and got rid of buildout. --- AUTHORS | 34 ++ CHANGELOG | 122 ++++++ README.txt | 26 ++ TODO | 45 +++ __init__.py | 375 ++++++++++++++++++ _bootstrap/COPYRIGHT.txt | 9 - _bootstrap/LICENSE.txt | 54 --- _bootstrap/bootstrap.py | 77 ---- bootstrap.py | 1 - buildout.cfg | 31 -- dammit.py | 292 ++++++++++++++ element.py | 851 +++++++++++++++++++++++++++++++++++++++++ lxml_test.py | 13 - setup.py | 44 --- src/beautifulsoup/AUTHORS | 34 -- src/beautifulsoup/CHANGELOG | 122 ------ src/beautifulsoup/README.txt | 26 -- src/beautifulsoup/TODO | 42 -- src/beautifulsoup/__init__.py | 375 ------------------ src/beautifulsoup/dammit.py | 292 -------------- src/beautifulsoup/element.py | 851 ----------------------------------------- src/beautifulsoup/python3.diff | 208 ---------- src/beautifulsoup/testing.py | 154 -------- src/beautifulsoup/util.py | 29 -- testall.sh | 2 - testing.py | 154 ++++++++ to3.sh | 9 - util.py | 29 ++ 28 files changed, 1928 insertions(+), 2373 deletions(-) create mode 100644 AUTHORS create mode 100644 CHANGELOG create mode 100644 README.txt create mode 100644 __init__.py delete mode 100644 _bootstrap/COPYRIGHT.txt delete mode 100644 _bootstrap/LICENSE.txt delete mode 100644 _bootstrap/bootstrap.py delete mode 120000 bootstrap.py delete mode 100644 buildout.cfg create mode 100644 dammit.py create mode 100644 element.py delete mode 100644 lxml_test.py delete mode 100644 setup.py delete mode 100644 src/beautifulsoup/AUTHORS delete mode 100644 src/beautifulsoup/CHANGELOG delete mode 100644 src/beautifulsoup/README.txt delete mode 100644 src/beautifulsoup/TODO delete mode 100644 src/beautifulsoup/__init__.py delete mode 100644 src/beautifulsoup/dammit.py delete mode 100644 src/beautifulsoup/element.py delete mode 100644 src/beautifulsoup/python3.diff delete mode 100644 
src/beautifulsoup/testing.py delete mode 100644 src/beautifulsoup/util.py delete mode 100755 testall.sh create mode 100644 testing.py delete mode 100755 to3.sh create mode 100644 util.py diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..d353253 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,34 @@ +Behold, mortal, the origins of Beautiful Soup... +================================================ + +Leonard Richardson is the primary programmer. + +Sam Ruby helps with a lot of edge cases. + +Mark Pilgrim provided the encoding detection code that forms the base +of UnicodeDammit. + +Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his +work in solving the nestable tags conundrum. + +The following people have contributed patches to Beautiful Soup: + + Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang, + Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris + Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren, + Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed + Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko + Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn + Webster, Paul Wright, Danny Yoo + +The following people made suggestions or found bugs or found ways to +break Beautiful Soup: + + Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Matt Ernst, + Michael Foord, Tom Harris, Bill de hOra, Donald Howes, Matt + Patterson, Scott Roberts, Steve Strassmann, Mike Williams, warchild + at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison, Joren Mc, + Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed Summers, + Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart Turner, Greg + Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de Sousa Rocha, + Yichun Wei, Per Vognsen diff --git a/CHANGELOG b/CHANGELOG new file mode 100644 index 0000000..4e97e1b --- /dev/null +++ b/CHANGELOG @@ -0,0 +1,122 @@ += 3.1.0 = + +A hybrid version that 
supports 2.4 and can be automatically converted +to run under Python 3.0. There are three backwards-incompatible +changes you should be aware of, but no new features or deliberate +behavior changes. + +1. str() may no longer do what you want. This is because the meaning +of str() inverts between Python 2 and 3; in Python 2 it gives you a +byte string, in Python 3 it gives you a Unicode string. + +The effect of this is that you can't pass an encoding to .__str__ +anymore. Use encode() to get a string and decode() to get Unicode, and +you'll be ready (well, readier) for Python 3. + +2. Beautiful Soup is now based on HTMLParser rather than SGMLParser, +which is gone in Python 3. There's some bad HTML that SGMLParser +handled but HTMLParser doesn't, usually to do with attribute values +that aren't closed or have brackets inside them: + + baz + ', '"> + +A later version of Beautiful Soup will allow you to plug in different +parsers to make tradeoffs between speed and the ability to handle bad +HTML. + +3. In Python 3 (but not Python 2), HTMLParser converts entities within +attributes to the corresponding Unicode characters. In Python 2 it's +possible to parse this string and leave the &eacute; intact. + + + +In Python 3, the &eacute; is always converted to \xe9 during +parsing. + + += 3.0.7a = + +Added an import that makes BS work in Python 2.3. + + += 3.0.7 = + +Fixed a UnicodeDecodeError when unpickling documents that contain +non-ASCII characters. + +Fixed a TypeError that occurred in some circumstances when a tag +contained no text. + +Jump through hoops to avoid the use of chardet, which can be extremely +slow in some circumstances. UTF-8 documents should never trigger the +use of chardet. + +Whitespace is preserved inside
 and ")
-
-    def test_single_quote_attribute_values_become_double_quotes(self):
-        self.assertSoupEquals("",
-                              '')
-
-    def test_attribute_values_with_nested_quotes_are_left_alone(self):
-        text = """a"""
-        self.assertSoupEquals(text)
-
-    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
-        text = """a"""
-        soup = self.soup(text)
-        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
-        self.assertSoupEquals(
-            soup.foo.decode(),
-            """a""")
-
-    def test_ampersand_in_attribute_value_gets_quoted(self):
-        self.assertSoupEquals('',
-                              '')
-
-
-class BuilderInvalidMarkupSmokeTest(SoupTest):
-    """Tests of invalid markup.
-
-    These are very likely to give different results for different tree
-    builders. It's not required that a tree builder handle invalid
-    markup at all.
-    """
-
-    def test_unclosed_block_level_elements(self):
-        # Unclosed block-level elements should be closed.
-        self.assertSoupEquals(
-            '

Foo

Bar', - '

Foo

Bar

') - - def test_fake_self_closing_tag(self): - # If a self-closing tag presents as a normal tag, the 'open' - # tag is treated as an instance of the self-closing tag and - # the 'close' tag is ignored. - self.assertSoupEquals( - "http://foo.com/", - "http://foo.com/") - - def test_boolean_attribute_with_no_value_gets_empty_value(self): - soup = self.soup("
foo
") - self.assertEquals(soup.table.td['nowrap'], '') - - def test_incorrectly_nested_tables(self): - self.assertSoupEquals( - '
', - '
') - - - diff --git a/src/beautifulsoup/util.py b/src/beautifulsoup/util.py deleted file mode 100644 index 693a7e2..0000000 --- a/src/beautifulsoup/util.py +++ /dev/null @@ -1,29 +0,0 @@ -# Helper functions and mixin classes for Beautiful Soup - -import types -try: - set -except NameError: - from sets import Set as set - -def isList(l): - """Convenience method that works with all 2.x versions of Python - to determine whether or not something is listlike.""" - return ((hasattr(l, '__iter__') and not isString(l)) - or (type(l) in (types.ListType, types.TupleType))) - -def isString(s): - """Convenience method that works with all 2.x versions of Python - to determine whether or not something is stringlike.""" - try: - return isinstance(s, unicode) or isinstance(s, basestring) - except NameError: - return isinstance(s, str) - -def buildSet(args=None): - """Turns a list or a string into a set.""" - if isinstance(args, str): - return set([args]) - if args is None: - return set() - return set(args) diff --git a/testall.sh b/testall.sh deleted file mode 100755 index 801124f..0000000 --- a/testall.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -python BeautifulSoupTests.py && sh to3.sh && cd python3 && python3 BeautifulSoupTests.py diff --git a/testing.py b/testing.py new file mode 100644 index 0000000..20d087e --- /dev/null +++ b/testing.py @@ -0,0 +1,154 @@ +"""Helper classes for tests.""" + +import unittest +from beautifulsoup import BeautifulSoup +from beautifulsoup.element import Comment, SoupStrainer +from beautifulsoup.builder.lxml_builder import LXMLTreeBuilder + +class SoupTest(unittest.TestCase): + + def setUp(self): + # LXMLTreeBuilder won't handle bad markup, but that's fine, + # since all the parsing tests take place in parser-specific + # test suites that override default_builder. 
+ self.default_builder = LXMLTreeBuilder() + + def soup(self, markup, **kwargs): + """Build a Beautiful Soup object from markup.""" + return BeautifulSoup(markup, builder=self.default_builder, **kwargs) + + def document_for(self, markup): + """Turn an HTML fragment into a document. + + The details depend on the builder. + """ + return self.default_builder.test_fragment_to_document(markup) + + def assertSoupEquals(self, to_parse, compare_parsed_to=None): + builder = self.default_builder + obj = BeautifulSoup(to_parse, builder=builder) + if compare_parsed_to is None: + compare_parsed_to = to_parse + + self.assertEquals(obj.decode(), self.document_for(compare_parsed_to)) + + + +class BuilderSmokeTest(SoupTest): + """A generic smoke test for tree builders. + + Subclasses of this test ensure that all of Beautiful Soup's tree + builders generate more or less the same trees. It's okay for trees + to differ, especially when given invalid markup--just override the + appropriate test method to demonstrate how one tree builder + differs from others. + """ + + def test_bare_string(self): + # A bare string is turned into some kind of HTML document or + # fragment recognizable as the original string. + self.assertSoupEquals("A bare string") + + def test_mixed_case_tags(self): + # Mixed-case tags are folded to lowercase. + self.assertSoupEquals( + "
", + "") + + def test_self_closing(self): + # HTML's self-closing tags are recognized as such. + self.assertSoupEquals( + "

A tag

", "

A tag

") + + self.assertSoupEquals( + "

Foo
bar

", "

Foo
bar

") + + def test_comment(self): + # Comments are represented as Comment objects. + markup = "

foobaz

" + self.assertSoupEquals(markup) + + soup = self.soup(markup) + comment = soup.find(text="foobar") + self.assertEquals(comment.__class__, Comment) + + def test_nested_inline_elements(self): + # Inline tags can be nested indefinitely. + b_tag = "Inside a B tag" + self.assertSoupEquals(b_tag) + + nested_b_tag = "

A nested tag

" + self.assertSoupEquals(nested_b_tag) + + double_nested_b_tag = "

A doubly nested tag

" + self.assertSoupEquals(nested_b_tag) + + def test_nested_block_level_elements(self): + soup = self.soup('

Foo

') + blockquote = soup.blockquote + self.assertEqual(blockquote.p.b.string, 'Foo') + self.assertEqual(blockquote.b.string, 'Foo') + + def test_collapsed_whitespace(self): + """In most tags, whitespace is collapsed.""" + self.assertSoupEquals("

", "

") + + def test_preserved_whitespace_in_pre_and_textarea(self): + """In
 and ")
+
+    def test_single_quote_attribute_values_become_double_quotes(self):
+        self.assertSoupEquals("",
+                              '')
+
+    def test_attribute_values_with_nested_quotes_are_left_alone(self):
+        text = """a"""
+        self.assertSoupEquals(text)
+
+    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
+        text = """a"""
+        soup = self.soup(text)
+        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
+        self.assertSoupEquals(
+            soup.foo.decode(),
+            """a""")
+
+    def test_ampersand_in_attribute_value_gets_quoted(self):
+        self.assertSoupEquals('',
+                              '')
+
+
+class BuilderInvalidMarkupSmokeTest(SoupTest):
+    """Tests of invalid markup.
+
+    These are very likely to give different results for different tree
+    builders. It's not required that a tree builder handle invalid
+    markup at all.
+    """
+
+    def test_unclosed_block_level_elements(self):
+        # Unclosed block-level elements should be closed.
+        self.assertSoupEquals(
+            '

Foo

Bar', + '

Foo

Bar

') + + def test_fake_self_closing_tag(self): + # If a self-closing tag presents as a normal tag, the 'open' + # tag is treated as an instance of the self-closing tag and + # the 'close' tag is ignored. + self.assertSoupEquals( + "http://foo.com/", + "http://foo.com/") + + def test_boolean_attribute_with_no_value_gets_empty_value(self): + soup = self.soup("
foo
") + self.assertEquals(soup.table.td['nowrap'], '') + + def test_incorrectly_nested_tables(self): + self.assertSoupEquals( + '
', + '
') + + + diff --git a/to3.sh b/to3.sh deleted file mode 100755 index 26b3246..0000000 --- a/to3.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/sh -mkdir python3 -for i in BeautifulSoupTests.py builder.py element.py dammit.py -do - cp $i python3/ - 2to3-3.0 -x next $i | patch -p0 python3/$i - cp python3/$i python3/$i.orig - patch -p0 python3/$i < $i.3.diff -done \ No newline at end of file diff --git a/util.py b/util.py new file mode 100644 index 0000000..693a7e2 --- /dev/null +++ b/util.py @@ -0,0 +1,29 @@ +# Helper functions and mixin classes for Beautiful Soup + +import types +try: + set +except NameError: + from sets import Set as set + +def isList(l): + """Convenience method that works with all 2.x versions of Python + to determine whether or not something is listlike.""" + return ((hasattr(l, '__iter__') and not isString(l)) + or (type(l) in (types.ListType, types.TupleType))) + +def isString(s): + """Convenience method that works with all 2.x versions of Python + to determine whether or not something is stringlike.""" + try: + return isinstance(s, unicode) or isinstance(s, basestring) + except NameError: + return isinstance(s, str) + +def buildSet(args=None): + """Turns a list or a string into a set.""" + if isinstance(args, str): + return set([args]) + if args is None: + return set() + return set(args) -- cgit v1.2.3