Beautiful Soup: We called him Tortoise because he taught us.

# -*- coding: utf-8 -*-
"""Unit tests for Beautiful Soup.

These tests make sure that Beautiful Soup works as it should. If you
find a bug in Beautiful Soup, the best way to express it is as a test
case like this that fails."""

import re
import unittest
from beautifulsoup import *
from beautifulsoup.element import CData, Comment, Declaration, SoupStrainer, Tag
from beautifulsoup.builder import ICantBelieveItsValidHTMLTreeBuilder
from beautifulsoup.dammit import UnicodeDammit


def additional_tests():
    """Return a test suite holding every test defined in this module."""
    return unittest.TestLoader().loadTestsFromName(__name__)


class SoupTest(unittest.TestCase):
    """Base class for the test cases below; adds a parse-and-compare helper."""

    # One builder instance, created when the class is defined and shared by
    # every test that does not pass its own builder.
    default_builder = HTMLParserXMLTreeBuilder()

    def assertSoupEquals(self, toParse, rep=None, builder=None, encoding=None):
        """Parse the given text and make sure its string rep is the other
        given text.

        toParse  -- the markup handed to BeautifulSoup.
        rep      -- the expected serialization; when omitted, the markup is
                    expected to come back out unchanged.
        builder  -- the tree builder to parse with; when omitted, the
                    class-level default_builder is used.
        encoding -- when given, the soup is serialized with
                    encode(encoding) instead of decode() before comparing.

        Fails the test (via assertEqual) if the serialization differs
        from the expected text.
        """
        if rep is None:
            rep = toParse
        # Honour an explicitly supplied builder. This argument used to be
        # accepted and then silently ignored in favour of default_builder.
        if builder is None:
            builder = self.default_builder
        obj = BeautifulSoup(toParse, builder=builder)
        if encoding is None:
            rep2 = obj.decode()
        else:
            rep2 = obj.encode(encoding)
        self.assertEqual(rep2, rep)


class FollowThatTag(SoupTest):
    "Tests the various ways of fetching tags from a soup."
def setUp(self): ml = """ 1 2 3 4 4""" self.soup = BeautifulStoneSoup(ml) def testFindAllByName(self): matching = self.soup('a') self.assertEqual(len(matching), 2) self.assertEqual(matching[0].name, 'a') self.assertEqual(matching, self.soup.findAll('a')) self.assertEqual(matching, self.soup.findAll(SoupStrainer('a'))) def testFindAllByAttribute(self): matching = self.soup.findAll(id='x') self.assertEqual(len(matching), 2) self.assertEqual(matching[0].name, 'a') self.assertEqual(matching[1].name, 'b') matching2 = self.soup.findAll(attrs={'id' : 'x'}) self.assertEqual(matching, matching2) strainer = SoupStrainer(attrs={'id' : 'x'}) self.assertEqual(matching, self.soup.findAll(strainer)) self.assertEqual(len(self.soup.findAll(id=None)), 1) self.assertEqual(len(self.soup.findAll(width=100)), 1) self.assertEqual(len(self.soup.findAll(junk=None)), 5) self.assertEqual(len(self.soup.findAll(junk=[1, None])), 5) self.assertEqual(len(self.soup.findAll(junk=re.compile('.*'))), 0) self.assertEqual(len(self.soup.findAll(junk=True)), 0) self.assertEqual(len(self.soup.findAll(junk=True)), 0) self.assertEqual(len(self.soup.findAll(href=True)), 1) def testFindallByClass(self): soup = BeautifulSoup('Foo Bar') self.assertEqual(soup.find('a', '1').string, "Bar") def testFindAllByList(self): matching = self.soup(['a', 'ac']) self.assertEqual(len(matching), 3) def testFindAllByHash(self): matching = self.soup({'a' : True, 'b' : True}) self.assertEqual(len(matching), 4) def testFindAllText(self): soup = BeautifulSoup("\xbb") self.assertEqual(soup.findAll(text=re.compile('.*')), [u'\xbb']) def testFindAllByRE(self): import re r = re.compile('a.*') self.assertEqual(len(self.soup(r)), 3) def testFindAllByMethod(self): def matchTagWhereIDMatchesName(tag): return tag.name == tag.get('id') matching = self.soup.findAll(matchTagWhereIDMatchesName) self.assertEqual(len(matching), 2) self.assertEqual(matching[0].name, 'a') def testParents(self): soup = BeautifulSoup('
Blah') b = soup.b self.assertEquals(len(b.findParents('ul', {'id' : 'foo'})), 2) self.assertEquals(b.findParent('ul')['a'], 'b') PROXIMITY_TEST = BeautifulSoup('') def testNext(self): soup = self.PROXIMITY_TEST b = soup.find('b', {'id' : 2}) self.assertEquals(b.findNext('b')['id'], '3') self.assertEquals(b.findNext('b')['id'], '3') self.assertEquals(len(b.findAllNext('b')), 2) self.assertEquals(len(b.findAllNext('b', {'id' : 4})), 1) def testPrevious(self): soup = self.PROXIMITY_TEST b = soup.find('b', {'id' : 3}) self.assertEquals(b.findPrevious('b')['id'], '2') self.assertEquals(b.findPrevious('b')['id'], '2') self.assertEquals(len(b.findAllPrevious('b')), 2) self.assertEquals(len(b.findAllPrevious('b', {'id' : 2})), 1) SIBLING_TEST = BeautifulSoup('
') def testNextSibling(self): soup = self.SIBLING_TEST tag = 'blockquote' b = soup.find(tag, {'id' : 2}) self.assertEquals(b.findNext(tag)['id'], '2.1') self.assertEquals(b.findNextSibling(tag)['id'], '3') self.assertEquals(b.findNextSibling(tag)['id'], '3') self.assertEquals(len(b.findNextSiblings(tag)), 2) self.assertEquals(len(b.findNextSiblings(tag, {'id' : 4})), 1) def testPreviousSibling(self): soup = self.SIBLING_TEST tag = 'blockquote' b = soup.find(tag, {'id' : 3}) self.assertEquals(b.findPrevious(tag)['id'], '2.1') self.assertEquals(b.findPreviousSibling(tag)['id'], '2') self.assertEquals(b.findPreviousSibling(tag)['id'], '2') self.assertEquals(len(b.findPreviousSiblings(tag)), 2) self.assertEquals(len(b.findPreviousSiblings(tag, id=1)), 1) def testTextNavigation(self): soup = BeautifulSoup('FooBarBaz
Blee
Blargh') baz = soup.find(text='Baz') self.assertEquals(baz.findParent("i")['id'], '1') self.assertEquals(baz.findNext(text='Blee'), 'Blee') self.assertEquals(baz.findNextSibling(text='Blee'), 'Blee') self.assertEquals(baz.findNextSibling(text='Blargh'), None) self.assertEquals(baz.findNextSibling('hr')['id'], '1') class SiblingRivalry(SoupTest): "Tests the nextSibling and previousSibling navigation." def testSiblings(self): soup = BeautifulSoup("
1
A
B
2
3
") secondLI = soup.find('li').nextSibling self.assert_(secondLI.name == 'li' and secondLI.string == '2') self.assertEquals(soup.find(text='1').nextSibling.name, 'p') self.assertEquals(soup.find('p').nextSibling, 'B') self.assertEquals(soup.find('p').nextSibling.previousSibling.nextSibling, 'B') class TagsAreObjectsToo(SoupTest): "Tests the various built-in functions of Tag objects." def testLen(self): soup = BeautifulSoup("123") self.assertEquals(len(soup.top), 3) class StringEmUp(SoupTest): "Tests the use of 'string' as an alias for a tag's only content." def testString(self): s = BeautifulSoup("foo") self.assertEquals(s.b.string, 'foo') def testLackOfString(self): s = BeautifulSoup("feo") self.assert_(not s.b.string) class ThatsMyLimit(SoupTest): "Tests the limit argument." def testBasicLimits(self): s = BeautifulSoup('

') self.assertEquals(len(s.findAll('br')), 4) self.assertEquals(len(s.findAll('br', limit=2)), 2) self.assertEquals(len(s('br', limit=2)), 2) class OnlyTheLonely(SoupTest): "Tests the parseOnly argument to the constructor." def setUp(self): x = [] for i in range(1,6): x.append('' % i) for j in range(100,103): x.append('Content %s.%s' % (i,j, i,j)) x.append('') self.x = ''.join(x) def testOnly(self): strainer = SoupStrainer("b") soup = BeautifulSoup(self.x, parseOnlyThese=strainer) self.assertEquals(len(soup), 15) strainer = SoupStrainer(id=re.compile("100.*")) soup = BeautifulSoup(self.x, parseOnlyThese=strainer) self.assertEquals(len(soup), 5) strainer = SoupStrainer(text=re.compile("10[01].*")) soup = BeautifulSoup(self.x, parseOnlyThese=strainer) self.assertEquals(len(soup), 10) strainer = SoupStrainer(text=lambda(x):x[8]=='3') soup = BeautifulSoup(self.x, parseOnlyThese=strainer) self.assertEquals(len(soup), 3) class PickleMeThis(SoupTest): "Testing features like pickle and deepcopy." def setUp(self): self.page = """ Beautiful Soup: We called him Tortoise because he taught us. foo bar """ self.soup = BeautifulSoup(self.page) def testPickle(self): import pickle dumped = pickle.dumps(self.soup, 2) loaded = pickle.loads(dumped) self.assertEqual(loaded.__class__, BeautifulSoup) self.assertEqual(loaded.decode(), self.soup.decode()) def testDeepcopy(self): from copy import deepcopy deepcopy(BeautifulSoup("")) copied = deepcopy(self.soup) self.assertEqual(copied.decode(), self.soup.decode()) def testUnicodePickle(self): import cPickle as pickle html = "" + chr(0xc3) + "" soup = BeautifulSoup(html) dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) loaded = pickle.loads(dumped) self.assertEqual(loaded.decode(), soup.decode()) class WriteOnlyCode(SoupTest): "Testing the modification of the tree." 
def testModifyAttributes(self): soup = BeautifulSoup('') soup.a['id'] = 2 self.assertEqual(soup.decode(), '') del(soup.a['id']) self.assertEqual(soup.decode(), '') soup.a['id2'] = 'foo' self.assertEqual(soup.decode(), '') def testNewTagCreation(self): "Makes sure tags don't step on each others' toes." soup = BeautifulSoup() builder = HTMLParserTreeBuilder() a = Tag(soup, builder, 'a') ol = Tag(soup, builder, 'ol') a['href'] = 'http://foo.com/' self.assertRaises(KeyError, lambda : ol['href']) def testTagReplacement(self): # Make sure you can replace an element with itself. text = "Foo " soup = BeautifulSoup(text) c = soup.c soup.c.replaceWith(c) self.assertEquals(soup.decode(), text) # A very simple case soup = BeautifulSoup("Argh!") soup.find(text="Argh!").replaceWith("Hooray!") newText = soup.find(text="Hooray!") b = soup.b self.assertEqual(newText.previous, b) self.assertEqual(newText.parent, b) self.assertEqual(newText.previous.next, newText) self.assertEqual(newText.next, None) # A more complex case soup = BeautifulSoup("Argh!") soup.b.insert(1, "Hooray!") newText = soup.find(text="Hooray!") self.assertEqual(newText.previous, "Argh!") self.assertEqual(newText.previous.next, newText) self.assertEqual(newText.previousSibling, "Argh!") self.assertEqual(newText.previousSibling.nextSibling, newText) self.assertEqual(newText.nextSibling, None) self.assertEqual(newText.next, soup.c) text = "There's no business like show business" soup = BeautifulSoup(text) no, show = soup.findAll('b') show.replaceWith(no) self.assertEquals(soup.decode(), "There's business like no business") # Even more complex soup = BeautifulSoup("Findlady!") builder = HTMLParserTreeBuilder() tag = Tag(soup, builder, 'magictag') tag.insert(0, "the") soup.a.insert(1, tag) b = soup.b c = soup.c theText = tag.find(text=True) findText = b.find(text="Find") self.assertEqual(findText.next, tag) self.assertEqual(tag.previous, findText) self.assertEqual(b.nextSibling, tag) 
self.assertEqual(tag.previousSibling, b) self.assertEqual(tag.nextSibling, c) self.assertEqual(c.previousSibling, tag) self.assertEqual(theText.next, c) self.assertEqual(c.previous, theText) # Aand... incredibly complex. soup = BeautifulSoup("""Wereservetherighttorefuseservice""") f = soup.f a = soup.a c = soup.c e = soup.e weText = a.find(text="We") soup.b.replaceWith(soup.f) self.assertEqual(soup.decode(), "Werefusetoservice") self.assertEqual(f.previous, weText) self.assertEqual(weText.next, f) self.assertEqual(f.previousSibling, weText) self.assertEqual(f.nextSibling, None) self.assertEqual(weText.nextSibling, f) def testAppend(self): doc = "
Don't leave me here.

Don't leave me.
" soup = BeautifulSoup(doc) second_para = soup('p')[1] bold = soup.find('b') soup('p')[1].append(soup.find('b')) self.assertEqual(bold.parent, second_para) self.assertEqual(soup.decode(), "
Don't leave me .
" "
Don't leave me.here
") def testTagExtraction(self): # A very simple case text = '
Nav crap
Real content here.' soup = BeautifulSoup(text) extracted = soup.find("div", id="nav").extract() self.assertEqual(soup.decode(), "Real content here.") self.assertEqual(extracted.decode(), '
Nav crap
') # A simple case, a more complex test. text = "12 iii AB" soup = BeautifulStoneSoup(text) doc = soup.doc numbers, roman, letters = soup("a") self.assertEqual(roman.parent, doc) oldPrevious = roman.previous endOfThisTag = roman.nextSibling.previous self.assertEqual(oldPrevious, "2") self.assertEqual(roman.next, "i") self.assertEqual(endOfThisTag, "ii") self.assertEqual(roman.previousSibling, numbers) self.assertEqual(roman.nextSibling, letters) roman.extract() self.assertEqual(roman.parent, None) self.assertEqual(roman.previous, None) self.assertEqual(roman.next, "i") self.assertEqual(letters.previous, '2') self.assertEqual(roman.previousSibling, None) self.assertEqual(roman.nextSibling, None) self.assertEqual(endOfThisTag.next, None) self.assertEqual(roman.b.contents[0].next, None) self.assertEqual(numbers.nextSibling, letters) self.assertEqual(letters.previousSibling, numbers) self.assertEqual(len(doc.contents), 2) self.assertEqual(doc.contents[0], numbers) self.assertEqual(doc.contents[1], letters) # A more complex case. text = "12Hollywood, baby!3" soup = BeautifulStoneSoup(text) one = soup.find(text="1") three = soup.find(text="3") toExtract = soup.b soup.b.extract() self.assertEqual(one.next, three) self.assertEqual(three.previous, one) self.assertEqual(one.parent.nextSibling, three) self.assertEqual(three.previousSibling, soup.a) class TheManWithoutAttributes(SoupTest): "Test attribute access" def testHasKey(self): text = "" self.assertTrue(BeautifulSoup(text).foo.has_key('attr')) class QuoteMeOnThat(SoupTest): "Test quoting" def testQuotedAttributeValues(self): self.assertSoupEquals("", '') text = """a""" soup = BeautifulSoup(text) self.assertEquals(soup.decode(), text) soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' newText = """a""" self.assertSoupEquals(soup.decode(), newText) self.assertSoupEquals('', '') class YoureSoLiteral(SoupTest): "Test literal mode." 
def testLiteralMode(self): text = "Foo" soup = BeautifulSoup(text) self.assertEqual(soup.script.contents[0], "if (iThis is an example of an HTML tag<&<&") class OperatorOverload(SoupTest): "Our operators do it all! Call now!" def testTagNameAsFind(self): "Tests that referencing a tag name as a member delegates to find()." soup = BeautifulSoup('foobarRed herring') self.assertEqual(soup.b.i, soup.find('b').find('i')) self.assertEqual(soup.b.i.string, 'bar') self.assertEqual(soup.b['id'], '1') self.assertEqual(soup.b.contents[0], 'foo') self.assert_(not soup.a) #Test the .fooTag variant of .foo. self.assertEqual(soup.bTag.iTag.string, 'bar') self.assertEqual(soup.b.iTag.string, 'bar') self.assertEqual(soup.find('b').find('i'), soup.bTag.iTag) class NestableEgg(SoupTest): """Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!""" def testParaInsideBlockquote(self): soup = BeautifulSoup('
Foo
Bar') self.assertEqual(soup.blockquote.p.b.string, 'Foo') self.assertEqual(soup.blockquote.b.string, 'Foo') self.assertEqual(soup.find('p', recursive=False).string, 'Bar') def testNestedTables(self): text = """
Here's another table:
Juicy text
""" soup = BeautifulSoup(text) self.assertEquals(soup.table.table.td.string, 'Juicy text') self.assertEquals(len(soup.findAll('table')), 2) self.assertEquals(len(soup.table.findAll('table')), 1) self.assertEquals(soup.find('table', {'id' : 2}).parent.parent.parent.name, 'table') text = "
Foo
" soup = BeautifulSoup(text) self.assertEquals(soup.table.tr.td.div.table.contents[0], "Foo") text = """FooBarBaz

""" soup = BeautifulSoup(text) self.assertEquals(soup.table.thead.tr.contents[0], "Foo") def testBadNestedTables(self): soup = BeautifulSoup("
") self.assertEquals(soup.table.tr.table.tr['id'], 'nested') class CleanupOnAisleFour(SoupTest): """Here we test cleanup of text that breaks HTMLParser or is just obnoxious.""" def testSelfClosingtag(self): self.assertEqual(BeautifulSoup("Foo
Bar").find('br').decode(), '
') self.assertSoupEquals('
test1
test2
', '
test1
test2
') text = '
test1test2' soup = BeautifulStoneSoup(text) self.assertEqual(soup.decode(), '
test1test2
') builder = HTMLParserXMLTreeBuilder(selfClosingTags='selfclosing') soup = BeautifulSoup(text, builder) self.assertEqual(soup.decode(), '
test1test2
') def testSelfClosingTagOrNot(self): text = "http://foo.com/" self.assertEqual(BeautifulStoneSoup(text).decode(), text) self.assertEqual(BeautifulSoup(text).decode(), 'http://foo.com/') def testBooleanAttributes(self): text = "" self.assertSoupEquals(text, text) def testCData(self): xml = "foobar" self.assertSoupEquals(xml, xml) r = re.compile("foo.*bar") soup = BeautifulSoup(xml) self.assertEquals(soup.find(text=r).string, "foobar") self.assertEquals(soup.find(text=r).__class__, CData) def testComments(self): xml = "foobaz" self.assertSoupEquals(xml) r = re.compile("foo.*bar") soup = BeautifulSoup(xml) self.assertEquals(soup.find(text=r).string, "foobar") self.assertEquals(soup.find(text="foobar").__class__, Comment) def testDeclaration(self): xml = "foobaz" self.assertSoupEquals(xml) r = re.compile(".*foo.*bar") soup = BeautifulSoup(xml) text = "DOCTYPE foobar" self.assertEquals(soup.find(text=r).string, text) self.assertEquals(soup.find(text=text).__class__, Declaration) namespaced_doctype = ('' 'foo') soup = BeautifulSoup(namespaced_doctype) self.assertEquals(soup.contents[0], 'DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"') self.assertEquals(soup.html.contents[0], 'foo') def testEntityConversions(self): text = "<<sacré bleu!>>" soup = BeautifulStoneSoup(text) self.assertSoupEquals(text) xmlEnt = Entities.XML_ENTITIES htmlEnt = Entities.HTML_ENTITIES xhtmlEnt = Entities.XHTML_ENTITIES xmlBuilder = HTMLParserXMLTreeBuilder(convertEntities=xmlEnt) htmlBuilder = HTMLParserXMLTreeBuilder(convertEntities=htmlEnt) xhtmlBuilder = HTMLParserXMLTreeBuilder(convertEntities=xhtmlEnt) soup = BeautifulStoneSoup(text, xmlBuilder) self.assertEquals(soup.decode(), "<>") soup = BeautifulStoneSoup(text, xmlBuilder) self.assertEquals(soup.decode(), "<>") soup = BeautifulStoneSoup(text, htmlBuilder) self.assertEquals(soup.decode(), u"<>") # Make sure the "XML", "HTML", and "XHTML" settings work. 
text = "<™'" soup = BeautifulStoneSoup(text, xmlBuilder) self.assertEquals(soup.decode(), u"<™'") soup = BeautifulStoneSoup(text, htmlBuilder) self.assertEquals(soup.decode(), u"<\u2122'") soup = BeautifulStoneSoup(text, xhtmlBuilder) self.assertEquals(soup.decode(), u"<\u2122'") def testNonBreakingSpaces(self): builder = HTMLParserTreeBuilder( convertEntities=BeautifulStoneSoup.HTML_ENTITIES) soup = BeautifulSoup(" ", builder) self.assertEquals(soup.decode(), u"\xa0\xa0") def testWhitespaceInDeclaration(self): self.assertSoupEquals('', '') def testJunkInDeclaration(self): self.assertSoupEquals('a', 'a') def testIncompleteDeclaration(self): self.assertSoupEquals('ac') def testEntityReplacement(self): self.assertSoupEquals('hello there') def testEntitiesInAttributeValues(self): self.assertSoupEquals('', '', encoding='utf-8') self.assertSoupEquals('', '', encoding='utf-8') builder = HTMLParserTreeBuilder(convertEntities=Entities.HTML_ENTITIES) soup = BeautifulSoup('', builder) self.assertEquals(soup.decode(), u'') uri = "http://crummy.com?sacré&bleu" link = '' % uri soup = BeautifulSoup(link, builder) self.assertEquals(soup.decode(), link.replace("é", u"\xe9")) uri = "http://crummy.com?sacré&bleu" link = '' % uri soup = BeautifulSoup(link, builder) self.assertEquals(soup.a['href'], uri.replace("é", u"\xe9")) def testNakedAmpersands(self): builder = HTMLParserXMLTreeBuilder(convertEntities=Entities.HTML_ENTITIES) soup = BeautifulStoneSoup("AT&T ", builder) self.assertEquals(soup.decode(), 'AT&T ') nakedAmpersandInASentence = "AT&T was Ma Bell" soup = BeautifulStoneSoup(nakedAmpersandInASentence, builder) self.assertEquals(soup.decode(), \ nakedAmpersandInASentence.replace('&','&')) invalidURL = 'foo' validURL = invalidURL.replace('&','&') soup = BeautifulStoneSoup(invalidURL) self.assertEquals(soup.decode(), validURL) soup = BeautifulStoneSoup(validURL) self.assertEquals(soup.decode(), validURL) class EncodeRed(SoupTest): """Tests encoding conversion, Unicode 
conversion, and Microsoft smart quote fixes.""" def testUnicodeDammitStandalone(self): markup = "\x92" dammit = UnicodeDammit(markup) self.assertEquals(dammit.unicode, "’") hebrew = "\xed\xe5\xec\xf9" dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9') self.assertEquals(dammit.originalEncoding, 'iso-8859-8') def testGarbageInGarbageOut(self): ascii = "a" asciiSoup = BeautifulStoneSoup(ascii) self.assertEquals(ascii, asciiSoup.decode()) unicodeData = u"\u00FC" utf8 = unicodeData.encode("utf-8") self.assertEquals(utf8, '\xc3\xbc') unicodeSoup = BeautifulStoneSoup(unicodeData) self.assertEquals(unicodeData, unicodeSoup.decode()) self.assertEquals(unicodeSoup.foo.string, u'\u00FC') utf8Soup = BeautifulStoneSoup(utf8, fromEncoding='utf-8') self.assertEquals(utf8, utf8Soup.encode('utf-8')) self.assertEquals(utf8Soup.originalEncoding, "utf-8") utf8Soup = BeautifulStoneSoup(unicodeData) self.assertEquals(utf8, utf8Soup.encode('utf-8')) self.assertEquals(utf8Soup.originalEncoding, None) def testHandleInvalidCodec(self): for bad_encoding in ['.utf8', '...', 'utF---16.!']: soup = BeautifulSoup(u"Räksmörgås".encode("utf-8"), fromEncoding=bad_encoding) self.assertEquals(soup.originalEncoding, 'utf-8') def testUnicodeSearch(self): html = u'
Räksmörgås
' soup = BeautifulSoup(html) self.assertEqual(soup.find(text=u'Räksmörgås'),u'Räksmörgås') def testRewrittenXMLHeader(self): euc_jp = '\n\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n\n' utf8 = "\n\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n\n" soup = BeautifulStoneSoup(euc_jp) if soup.originalEncoding != "euc-jp": raise Exception("Test failed when parsing euc-jp document. " "If you're running Python >=2.4, or you have " "cjkcodecs installed, this is a real problem. " "Otherwise, ignore it.") self.assertEquals(soup.originalEncoding, "euc-jp") self.assertEquals(soup.encodeContents('utf-8'), utf8) old_text = "\x92" new_text = "’" self.assertSoupEquals(old_text, new_text) def testRewrittenMetaTag(self): no_shift_jis_html = '''\n
\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n
''' soup = BeautifulSoup(no_shift_jis_html) # Beautiful Soup used to try to rewrite the meta tag even if the # meta tag got filtered out by the strainer. This test makes # sure that doesn't happen. strainer = SoupStrainer('pre') soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer) self.assertEquals(soup.contents[0].name, 'pre') meta_tag = ('') shift_jis_html = ( '\n%s\n' '' '
\n' '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f' '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c' '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n' '
') % meta_tag soup = BeautifulSoup(shift_jis_html) if soup.originalEncoding != "shift-jis": raise Exception("Test failed when parsing shift-jis document " "with meta tag '%s'." "If you're running Python >=2.4, or you have " "cjkcodecs installed, this is a real problem. " "Otherwise, ignore it." % meta_tag) self.assertEquals(soup.originalEncoding, "shift-jis") content_type_tag = soup.meta['content'] self.assertEquals(content_type_tag[content_type_tag.find('charset='):], 'charset=%SOUP-ENCODING%') content_type = str(soup.meta) index = content_type.find('charset=') self.assertEqual(content_type[index:index+len('charset=utf8')+1], 'charset=utf-8') content_type = soup.meta.encode('shift-jis') index = content_type.find('charset=') self.assertEqual(content_type[index:index+len('charset=shift-jis')], 'charset=shift-jis'.encode()) self.assertEquals(soup.encode('utf-8'), ( '\n' '\n' '' '
\n' '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3' '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3' '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6' '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3' '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n' '
')) self.assertEquals(soup.encode("shift-jis"), shift_jis_html.replace('x-sjis'.encode(), 'shift-jis'.encode())) isolatin = """Sacr\xe9 bleu!""" soup = BeautifulSoup(isolatin) utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode()) utf8 = utf8.replace("\xe9", "\xc3\xa9") self.assertSoupEquals(soup.encode("utf-8"), utf8, encoding='utf-8') def testHebrew(self): iso_8859_8= '\nHebrew (ISO 8859-8) in Visual Directionality\n\n\n\n\n\n
Hebrew (ISO 8859-8) in Visual Directionality
\n\xed\xe5\xec\xf9\n\n' utf8 = '\nHebrew (ISO 8859-8) in Visual Directionality\n\n\n
Hebrew (ISO 8859-8) in Visual Directionality
\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n\n' soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8") self.assertEquals(soup.encode('utf-8'), utf8) def testSmartQuotesNotSoSmartAnymore(self): self.assertSoupEquals("\x91Foo\x92 ", '‘Foo’ ') def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self): smartQuotes = "Il a dit, \x8BSacré bleu!\x9b" soup = BeautifulSoup(smartQuotes) self.assertEquals(soup.decode(), 'Il a dit, ‹Sacré bleu!›') builder = HTMLParserTreeBuilder(convertEntities="html") soup = BeautifulSoup(smartQuotes, builder) self.assertEquals(soup.encode('utf-8'), 'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba') def testDontSeeSmartQuotesWhereThereAreNone(self): utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch" self.assertSoupEquals(utf_8, encoding='utf-8') class Whitewash(SoupTest): """Test whitespace preservation.""" def testPreservedWhitespace(self): self.assertSoupEquals("

") self.assertSoupEquals("
woo
") def testCollapsedWhitespace(self): self.assertSoupEquals("

", "

") class AlternateBuilders(SoupTest): """Test alternate builders.""" def testICantBelieveItsValidHTML(self): builder = ICantBelieveItsValidHTMLTreeBuilder() markup = "FooBar" soup = BeautifulSoup(markup) self.assertEquals(soup.decode(), "FooBar") soup = BeautifulSoup(markup, builder=builder) self.assertEquals(soup.decode(), markup) if __name__ == '__main__': unittest.main()
foo