From e2cdc424ee48a38a6217f3eb0ca6adf513694a84 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 10 Apr 2009 11:22:53 -0400 Subject: First attempt at an import. --- src/beautifulsoup/AUTHORS | 34 ++ src/beautifulsoup/CHANGELOG | 122 ++++ src/beautifulsoup/README | 7 + src/beautifulsoup/TODO | 42 ++ src/beautifulsoup/__init__.py | 368 ++++++++++++ src/beautifulsoup/builder.py | 484 ++++++++++++++++ src/beautifulsoup/builder.py.3.diff | 4 + src/beautifulsoup/dammit.py | 292 ++++++++++ src/beautifulsoup/dammit.py.3.diff | 70 +++ src/beautifulsoup/docs/__init__.py | 16 + src/beautifulsoup/element.py | 870 ++++++++++++++++++++++++++++ src/beautifulsoup/element.py.3.diff | 8 + src/beautifulsoup/tests/__init__.py | 16 + src/beautifulsoup/tests/test_docs.py | 51 ++ src/beautifulsoup/tests/test_docs.py.3.diff | 122 ++++ src/beautifulsoup/tests/test_soup.py | 854 +++++++++++++++++++++++++++ src/beautifulsoup/util.py | 29 + 17 files changed, 3389 insertions(+) create mode 100644 src/beautifulsoup/AUTHORS create mode 100644 src/beautifulsoup/CHANGELOG create mode 100644 src/beautifulsoup/README create mode 100644 src/beautifulsoup/TODO create mode 100644 src/beautifulsoup/__init__.py create mode 100644 src/beautifulsoup/builder.py create mode 100644 src/beautifulsoup/builder.py.3.diff create mode 100644 src/beautifulsoup/dammit.py create mode 100644 src/beautifulsoup/dammit.py.3.diff create mode 100644 src/beautifulsoup/docs/__init__.py create mode 100644 src/beautifulsoup/element.py create mode 100644 src/beautifulsoup/element.py.3.diff create mode 100644 src/beautifulsoup/tests/__init__.py create mode 100644 src/beautifulsoup/tests/test_docs.py create mode 100644 src/beautifulsoup/tests/test_docs.py.3.diff create mode 100644 src/beautifulsoup/tests/test_soup.py create mode 100644 src/beautifulsoup/util.py (limited to 'src') diff --git a/src/beautifulsoup/AUTHORS b/src/beautifulsoup/AUTHORS new file mode 100644 index 0000000..d353253 --- /dev/null +++ 
b/src/beautifulsoup/AUTHORS @@ -0,0 +1,34 @@ +Behold, mortal, the origins of Beautiful Soup... +================================================ + +Leonard Richardson is the primary programmer. + +Sam Ruby helps with a lot of edge cases. + +Mark Pilgrim provided the encoding detection code that forms the base +of UnicodeDammit. + +Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his +work in solving the nestable tags conundrum. + +The following people have contributed patches to Beautiful Soup: + + Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang, + Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris + Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren, + Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed + Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko + Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn + Webster, Paul Wright, Danny Yoo + +The following people made suggestions or found bugs or found ways to +break Beautiful Soup: + + Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Matt Ernst, + Michael Foord, Tom Harris, Bill de hOra, Donald Howes, Matt + Patterson, Scott Roberts, Steve Strassmann, Mike Williams, warchild + at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison, Joren Mc, + Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed Summers, + Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart Turner, Greg + Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de Sousa Rocha, + Yichun Wei, Per Vognsen diff --git a/src/beautifulsoup/CHANGELOG b/src/beautifulsoup/CHANGELOG new file mode 100644 index 0000000..4e97e1b --- /dev/null +++ b/src/beautifulsoup/CHANGELOG @@ -0,0 +1,122 @@ += 3.1.0 = + +A hybrid version that supports 2.4 and can be automatically converted +to run under Python 3.0. There are three backwards-incompatible +changes you should be aware of, but no new features or deliberate +behavior changes. + +1. 
str() may no longer do what you want. This is because the meaning +of str() inverts between Python 2 and 3; in Python 2 it gives you a +byte string, in Python 3 it gives you a Unicode string. + +The effect of this is that you can't pass an encoding to .__str__ +anymore. Use encode() to get a string and decode() to get Unicode, and +you'll be ready (well, readier) for Python 3. + +2. Beautiful Soup is now based on HTMLParser rather than SGMLParser, +which is gone in Python 3. There's some bad HTML that SGMLParser +handled but HTMLParser doesn't, usually to do with attribute values +that aren't closed or have brackets inside them: + + <a href="foo</a>, </a><a href="bar">baz</a> + <a b="<a>">', '<a b="&lt;a&gt;"></a><a>"></a> + +A later version of Beautiful Soup will allow you to plug in different +parsers to make tradeoffs between speed and the ability to handle bad +HTML. + +3. In Python 3 (but not Python 2), HTMLParser converts entities within +attributes to the corresponding Unicode characters. In Python 2 it's +possible to parse this string and leave the &eacute; intact. + + <a href="http://crummy.com?sacr&eacute;&bleu"> +In Python 3, the &eacute; is always converted to \xe9 during +parsing. + + += 3.0.7a = + +Added an import that makes BS work in Python 2.3. + + += 3.0.7 = + +Fixed a UnicodeDecodeError when unpickling documents that contain +non-ASCII characters. + +Fixed a TypeError that occurred in some circumstances when a tag +contained no text. + +Jump through hoops to avoid the use of chardet, which can be extremely +slow in some circumstances. UTF-8 documents should never trigger the +use of chardet. + +Whitespace is preserved inside
 and "
+        soup = BeautifulSoup(text)
+        self.assertEqual(soup.textarea.contents[0],
+                         "This is an example of an HTML tag<&<&")
+
+class OperatorOverload(SoupTest):
+    "Our operators do it all! Call now!"
+    # Checks that attribute-style access on a soup or tag (soup.b,
+    # soup.bTag) gives the same results as explicit find() calls.
+
+    def testTagNameAsFind(self):
+        "Tests that referencing a tag name as a member delegates to find()."
+        # NOTE(review): the markup literal below contains no tags, yet the
+        # assertions expect a <b> element with id '1' containing an <i>
+        # element. Presumably the tags were stripped when this patch was
+        # extracted; restore the literal from the upstream file before
+        # relying on this test.
+        soup = BeautifulSoup('foobarRed herring')
+        # Attribute access must match chained find() calls.
+        self.assertEqual(soup.b.i, soup.find('b').find('i'))
+        self.assertEqual(soup.b.i.string, 'bar')
+        self.assertEqual(soup.b['id'], '1')
+        self.assertEqual(soup.b.contents[0], 'foo')
+        # Referencing a tag name that is not in the document must be falsy.
+        self.assert_(not soup.a)
+
+        #Test the .fooTag variant of .foo.
+        # The 'Tag'-suffixed spelling must resolve to the same elements as
+        # the bare tag name and as find().
+        self.assertEqual(soup.bTag.iTag.string, 'bar')
+        self.assertEqual(soup.b.iTag.string, 'bar')
+        self.assertEqual(soup.find('b').find('i'), soup.bTag.iTag)
+
+class NestableEgg(SoupTest):
+    """Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!"""
+
+    def testParaInsideBlockquote(self):
+        soup = BeautifulSoup('

Foo

Bar') + self.assertEqual(soup.blockquote.p.b.string, 'Foo') + self.assertEqual(soup.blockquote.b.string, 'Foo') + self.assertEqual(soup.find('p', recursive=False).string, 'Bar') + + def testNestedTables(self): + text = """
Here's another table: +
Juicy text
""" + soup = BeautifulSoup(text) + self.assertEquals(soup.table.table.td.string, 'Juicy text') + self.assertEquals(len(soup.findAll('table')), 2) + self.assertEquals(len(soup.table.findAll('table')), 1) + self.assertEquals(soup.find('table', {'id' : 2}).parent.parent.parent.name, + 'table') + + text = "
Foo
" + soup = BeautifulSoup(text) + self.assertEquals(soup.table.tr.td.div.table.contents[0], "Foo") + + text = """FooBar + Baz
""" + soup = BeautifulSoup(text) + self.assertEquals(soup.table.thead.tr.contents[0], "Foo") + + def testBadNestedTables(self): + soup = BeautifulSoup("
") + self.assertEquals(soup.table.tr.table.tr['id'], 'nested') + +class CleanupOnAisleFour(SoupTest): + """Here we test cleanup of text that breaks HTMLParser or is just + obnoxious.""" + + def testSelfClosingtag(self): + self.assertEqual(BeautifulSoup("Foo
Bar").find('br').decode(), + '
') + + self.assertSoupEquals('

test1
test2

', + '

test1
test2

') + + text = '

test1test2' + soup = BeautifulStoneSoup(text) + self.assertEqual(soup.decode(), + '

test1test2

') + + builder = HTMLParserXMLTreeBuilder(selfClosingTags='selfclosing') + soup = BeautifulSoup(text, builder) + self.assertEqual(soup.decode(), + '

test1test2

') + + def testSelfClosingTagOrNot(self): + text = "http://foo.com/" + self.assertEqual(BeautifulStoneSoup(text).decode(), text) + self.assertEqual(BeautifulSoup(text).decode(), + 'http://foo.com/') + + def testBooleanAttributes(self): + text = "" + self.assertSoupEquals(text, text) + + def testCData(self): + xml = "foobar" + self.assertSoupEquals(xml, xml) + r = re.compile("foo.*bar") + soup = BeautifulSoup(xml) + self.assertEquals(soup.find(text=r).string, "foobar") + self.assertEquals(soup.find(text=r).__class__, CData) + + def testComments(self): + xml = "foobaz" + self.assertSoupEquals(xml) + r = re.compile("foo.*bar") + soup = BeautifulSoup(xml) + self.assertEquals(soup.find(text=r).string, "foobar") + self.assertEquals(soup.find(text="foobar").__class__, Comment) + + def testDeclaration(self): + xml = "foobaz" + self.assertSoupEquals(xml) + r = re.compile(".*foo.*bar") + soup = BeautifulSoup(xml) + text = "DOCTYPE foobar" + self.assertEquals(soup.find(text=r).string, text) + self.assertEquals(soup.find(text=text).__class__, Declaration) + + namespaced_doctype = ('' + 'foo') + soup = BeautifulSoup(namespaced_doctype) + self.assertEquals(soup.contents[0], + 'DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"') + self.assertEquals(soup.html.contents[0], 'foo') + + def testEntityConversions(self): + text = "<<sacré bleu!>>" + soup = BeautifulStoneSoup(text) + self.assertSoupEquals(text) + + xmlEnt = Entities.XML_ENTITIES + htmlEnt = Entities.HTML_ENTITIES + xhtmlEnt = Entities.XHTML_ENTITIES + + xmlBuilder = HTMLParserXMLTreeBuilder(convertEntities=xmlEnt) + htmlBuilder = HTMLParserXMLTreeBuilder(convertEntities=htmlEnt) + xhtmlBuilder = HTMLParserXMLTreeBuilder(convertEntities=xhtmlEnt) + + soup = BeautifulStoneSoup(text, xmlBuilder) + self.assertEquals(soup.decode(), "<>") + + soup = BeautifulStoneSoup(text, xmlBuilder) + self.assertEquals(soup.decode(), "<>") + + soup = BeautifulStoneSoup(text, htmlBuilder) + self.assertEquals(soup.decode(), u"<>") + + # Make sure 
the "XML", "HTML", and "XHTML" settings work. + text = "<™'" + soup = BeautifulStoneSoup(text, xmlBuilder) + self.assertEquals(soup.decode(), u"<™'") + + soup = BeautifulStoneSoup(text, htmlBuilder) + self.assertEquals(soup.decode(), u"<\u2122'") + + soup = BeautifulStoneSoup(text, xhtmlBuilder) + self.assertEquals(soup.decode(), u"<\u2122'") + + def testNonBreakingSpaces(self): + builder = HTMLParserTreeBuilder( + convertEntities=BeautifulStoneSoup.HTML_ENTITIES) + soup = BeautifulSoup("  ", builder) + self.assertEquals(soup.decode(), u"\xa0\xa0") + + def testWhitespaceInDeclaration(self): + self.assertSoupEquals('', '') + + def testJunkInDeclaration(self): + self.assertSoupEquals('a', 'a') + + def testIncompleteDeclaration(self): + self.assertSoupEquals('ac') + + def testEntityReplacement(self): + self.assertSoupEquals('hello there') + + def testEntitiesInAttributeValues(self): + self.assertSoupEquals('', '', + encoding='utf-8') + self.assertSoupEquals('', '', + encoding='utf-8') + + builder = HTMLParserTreeBuilder(convertEntities=Entities.HTML_ENTITIES) + soup = BeautifulSoup('', builder) + self.assertEquals(soup.decode(), u'') + + uri = "http://crummy.com?sacré&bleu" + link = '' % uri + + soup = BeautifulSoup(link, builder) + self.assertEquals(soup.decode(), + link.replace("é", u"\xe9")) + + uri = "http://crummy.com?sacré&bleu" + link = '' % uri + soup = BeautifulSoup(link, builder) + self.assertEquals(soup.a['href'], + uri.replace("é", u"\xe9")) + + def testNakedAmpersands(self): + builder = HTMLParserXMLTreeBuilder(convertEntities=Entities.HTML_ENTITIES) + soup = BeautifulStoneSoup("AT&T ", builder) + self.assertEquals(soup.decode(), 'AT&T ') + + nakedAmpersandInASentence = "AT&T was Ma Bell" + soup = BeautifulStoneSoup(nakedAmpersandInASentence, builder) + self.assertEquals(soup.decode(), \ + nakedAmpersandInASentence.replace('&','&')) + + invalidURL = 'foo' + validURL = invalidURL.replace('&','&') + soup = BeautifulStoneSoup(invalidURL) + 
self.assertEquals(soup.decode(), validURL) + + soup = BeautifulStoneSoup(validURL) + self.assertEquals(soup.decode(), validURL) + + +class EncodeRed(SoupTest): + """Tests encoding conversion, Unicode conversion, and Microsoft + smart quote fixes.""" + + def testUnicodeDammitStandalone(self): + markup = "\x92" + dammit = UnicodeDammit(markup) + self.assertEquals(dammit.unicode, "") + + hebrew = "\xed\xe5\xec\xf9" + dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) + self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9') + self.assertEquals(dammit.originalEncoding, 'iso-8859-8') + + def testGarbageInGarbageOut(self): + ascii = "a" + asciiSoup = BeautifulStoneSoup(ascii) + self.assertEquals(ascii, asciiSoup.decode()) + + unicodeData = u"\u00FC" + utf8 = unicodeData.encode("utf-8") + self.assertEquals(utf8, '\xc3\xbc') + + unicodeSoup = BeautifulStoneSoup(unicodeData) + self.assertEquals(unicodeData, unicodeSoup.decode()) + self.assertEquals(unicodeSoup.foo.string, u'\u00FC') + + utf8Soup = BeautifulStoneSoup(utf8, fromEncoding='utf-8') + self.assertEquals(utf8, utf8Soup.encode('utf-8')) + self.assertEquals(utf8Soup.originalEncoding, "utf-8") + + utf8Soup = BeautifulStoneSoup(unicodeData) + self.assertEquals(utf8, utf8Soup.encode('utf-8')) + self.assertEquals(utf8Soup.originalEncoding, None) + + + def testHandleInvalidCodec(self): + for bad_encoding in ['.utf8', '...', 'utF---16.!']: + soup = BeautifulSoup(u"RäksmörgÃ¥s".encode("utf-8"), + fromEncoding=bad_encoding) + self.assertEquals(soup.originalEncoding, 'utf-8') + + def testUnicodeSearch(self): + html = u'

Räksmörgås

' + soup = BeautifulSoup(html) + self.assertEqual(soup.find(text=u'Räksmörgås'),u'Räksmörgås') + + def testRewrittenXMLHeader(self): + euc_jp = '\n\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n\n' + utf8 = "\n\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n\n" + soup = BeautifulStoneSoup(euc_jp) + if soup.originalEncoding != "euc-jp": + raise Exception("Test failed when parsing euc-jp document. " + "If you're running Python >=2.4, or you have " + "cjkcodecs installed, this is a real problem. " + "Otherwise, ignore it.") + + self.assertEquals(soup.originalEncoding, "euc-jp") + self.assertEquals(soup.renderContents('utf-8'), utf8) + + old_text = "\x92" + new_text = "" + self.assertSoupEquals(old_text, new_text) + + def testRewrittenMetaTag(self): + no_shift_jis_html = '''\n
\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n
''' + soup = BeautifulSoup(no_shift_jis_html) + + # Beautiful Soup used to try to rewrite the meta tag even if the + # meta tag got filtered out by the strainer. This test makes + # sure that doesn't happen. + strainer = SoupStrainer('pre') + soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer) + self.assertEquals(soup.contents[0].name, 'pre') + + meta_tag = ('') + shift_jis_html = ( + '\n%s\n' + '' + '
\n'
+            '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
+            '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
+            '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
+            '
') % meta_tag + soup = BeautifulSoup(shift_jis_html) + if soup.originalEncoding != "shift-jis": + raise Exception("Test failed when parsing shift-jis document " + "with meta tag '%s'." + "If you're running Python >=2.4, or you have " + "cjkcodecs installed, this is a real problem. " + "Otherwise, ignore it." % meta_tag) + self.assertEquals(soup.originalEncoding, "shift-jis") + + content_type_tag = soup.meta['content'] + self.assertEquals(content_type_tag[content_type_tag.find('charset='):], + 'charset=%SOUP-ENCODING%') + content_type = str(soup.meta) + index = content_type.find('charset=') + self.assertEqual(content_type[index:index+len('charset=utf8')+1], + 'charset=utf-8') + content_type = soup.meta.encode('shift-jis') + index = content_type.find('charset=') + self.assertEqual(content_type[index:index+len('charset=shift-jis')], + 'charset=shift-jis'.encode()) + + self.assertEquals(soup.encode('utf-8'), ( + '\n' + '\n' + '' + '
\n'
+                '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3'
+                '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3'
+                '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6'
+                '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3'
+                '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'
+                '
')) + self.assertEquals(soup.encode("shift-jis"), + shift_jis_html.replace('x-sjis'.encode(), + 'shift-jis'.encode())) + + isolatin = """Sacr\xe9 bleu!""" + soup = BeautifulSoup(isolatin) + + utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode()) + utf8 = utf8.replace("\xe9", "\xc3\xa9") + self.assertSoupEquals(soup.encode("utf-8"), utf8, encoding='utf-8') + + def testHebrew(self): + iso_8859_8= '\nHebrew (ISO 8859-8) in Visual Directionality\n\n\n\n\n\n

Hebrew (ISO 8859-8) in Visual Directionality

\n\xed\xe5\xec\xf9\n\n' + utf8 = '\nHebrew (ISO 8859-8) in Visual Directionality\n\n\n

Hebrew (ISO 8859-8) in Visual Directionality

\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n\n' + soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8") + self.assertEquals(soup.encode('utf-8'), utf8) + + def testSmartQuotesNotSoSmartAnymore(self): + self.assertSoupEquals("\x91Foo\x92 ", + '‘Foo’ ') + + def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self): + smartQuotes = "Il a dit, \x8BSacré bleu!\x9b" + soup = BeautifulSoup(smartQuotes) + self.assertEquals(soup.decode(), + 'Il a dit, ‹Sacré bleu!›') + builder = HTMLParserTreeBuilder(convertEntities="html") + soup = BeautifulSoup(smartQuotes, builder) + self.assertEquals(soup.encode('utf-8'), + 'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba') + + def testDontSeeSmartQuotesWhereThereAreNone(self): + utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch" + self.assertSoupEquals(utf_8, encoding='utf-8') + + +class Whitewash(SoupTest): + """Test whitespace preservation.""" + + def testPreservedWhitespace(self): + self.assertSoupEquals("
   
") + self.assertSoupEquals("
 woo  
") + + def testCollapsedWhitespace(self): + self.assertSoupEquals("

", "

") + + +class AlternateBuilders(SoupTest): + """Test alternate builders.""" + + def testICantBelieveItsValidHTML(self): + builder = ICantBelieveItsValidHTMLTreeBuilder() + markup = "FooBar" + + soup = BeautifulSoup(markup) + self.assertEquals(soup.decode(), "FooBar") + + soup = BeautifulSoup(markup, builder=builder) + self.assertEquals(soup.decode(), markup) + + +if __name__ == '__main__': + unittest.main() diff --git a/src/beautifulsoup/util.py b/src/beautifulsoup/util.py new file mode 100644 index 0000000..693a7e2 --- /dev/null +++ b/src/beautifulsoup/util.py @@ -0,0 +1,29 @@ +# Helper functions and mixin classes for Beautiful Soup + +import types +try: + set +except NameError: + from sets import Set as set + +def isList(l): + """Convenience method that works with all 2.x versions of Python + to determine whether or not something is listlike.""" + return ((hasattr(l, '__iter__') and not isString(l)) + or (type(l) in (types.ListType, types.TupleType))) + +def isString(s): + """Convenience method that works with all 2.x versions of Python + to determine whether or not something is stringlike.""" + try: + return isinstance(s, unicode) or isinstance(s, basestring) + except NameError: + return isinstance(s, str) + +def buildSet(args=None): + """Turns a list or a string into a set.""" + if isinstance(args, str): + return set([args]) + if args is None: + return set() + return set(args) -- cgit v1.2.3
foo