diff options
-rw-r--r-- | AUTHORS (renamed from src/beautifulsoup/AUTHORS) | 0 | ||||
-rw-r--r-- | CHANGELOG (renamed from src/beautifulsoup/CHANGELOG) | 0 | ||||
-rw-r--r-- | README.txt (renamed from src/beautifulsoup/README.txt) | 0 | ||||
-rw-r--r-- | TODO | 45 | ||||
-rw-r--r-- | __init__.py (renamed from src/beautifulsoup/__init__.py) | 0 | ||||
-rw-r--r-- | _bootstrap/COPYRIGHT.txt | 9 | ||||
-rw-r--r-- | _bootstrap/LICENSE.txt | 54 | ||||
-rw-r--r-- | _bootstrap/bootstrap.py | 77 | ||||
l--------- | bootstrap.py | 1 | ||||
-rw-r--r-- | buildout.cfg | 31 | ||||
-rw-r--r-- | dammit.py (renamed from src/beautifulsoup/dammit.py) | 0 | ||||
-rw-r--r-- | element.py (renamed from src/beautifulsoup/element.py) | 0 | ||||
-rw-r--r-- | lxml_test.py | 13 | ||||
-rw-r--r-- | setup.py | 44 | ||||
-rw-r--r-- | src/beautifulsoup/TODO | 42 | ||||
-rw-r--r-- | src/beautifulsoup/python3.diff | 208 | ||||
-rwxr-xr-x | testall.sh | 2 | ||||
-rw-r--r-- | testing.py (renamed from src/beautifulsoup/testing.py) | 0 | ||||
-rwxr-xr-x | to3.sh | 9 | ||||
-rw-r--r-- | util.py (renamed from src/beautifulsoup/util.py) | 0 |
20 files changed, 45 insertions, 490 deletions
diff --git a/src/beautifulsoup/AUTHORS b/AUTHORS index d353253..d353253 100644 --- a/src/beautifulsoup/AUTHORS +++ b/AUTHORS diff --git a/src/beautifulsoup/CHANGELOG b/CHANGELOG index 4e97e1b..4e97e1b 100644 --- a/src/beautifulsoup/CHANGELOG +++ b/CHANGELOG diff --git a/src/beautifulsoup/README.txt b/README.txt index ff83212..ff83212 100644 --- a/src/beautifulsoup/README.txt +++ b/README.txt @@ -6,3 +6,48 @@ Calculate tag.string dynamically rather than when creating the tree. The html5lib builder doesn't use popTag, and adding/removing things from the tree after the fact may also change the value/availability of tag.string. + +--- + +Here are some unit tests that fail with HTMLParser. + + def testValidButBogusDeclarationFAILS(self): + self.assertSoupEquals('<! Foo >a', '<!Foo >a') + + def testIncompleteDeclarationAtEndFAILS(self): + self.assertSoupEquals('a<!b') + + def testIncompleteEntityAtEndFAILS(self): + self.assertSoupEquals('<Hello>') + + # This is not what the original author had in mind, but it's + # a legitimate interpretation of what they wrote. + self.assertSoupEquals("""<a href="foo</a>, </a><a href="bar">baz</a>""", + '<a href="foo</a>, </a><a href="></a>, <a href="bar">baz</a>') + # SGMLParser generates bogus parse events when attribute values + # contain embedded brackets, but at least Beautiful Soup fixes + # it up a little. + self.assertSoupEquals('<a b="<a>">', '<a b="<a>"></a><a>"></a>') + self.assertSoupEquals('<a href="http://foo.com/<a> and blah and blah', + """<a href='"http://foo.com/'></a><a> and blah and blah</a>""") + + invalidEntity = "foo&#bar;baz" + soup = BeautifulStoneSoup\ + (invalidEntity, + convertEntities=htmlEnt) + self.assertEquals(str(soup), invalidEntity) + + +Tag names that contain Unicode characters crash the parser: + def testUnicodeTagNamesFAILS(self): + self.assertSoupEquals("<デダ芻デダtext>2PM</デダ芻デダtext>") + +Here's the implementation of NavigableString.__unicode__: + + def __unicode__(self): + return unicode(str(self)) + +It converts the Unicode to a string, and then back to Unicode. I can't +find any other way of turning an element of a Unicode subclass into a +normal Unicode object. This is pretty bad and a better technique is +welcome. diff --git a/src/beautifulsoup/__init__.py b/__init__.py index 8817164..8817164 100644 --- a/src/beautifulsoup/__init__.py +++ b/__init__.py diff --git a/_bootstrap/COPYRIGHT.txt b/_bootstrap/COPYRIGHT.txt deleted file mode 100644 index 0e07bd9..0000000 --- a/_bootstrap/COPYRIGHT.txt +++ /dev/null @@ -1,9 +0,0 @@ -Copyright (c) 2004-2009 Zope Corporation and Contributors. -All Rights Reserved. - -This software is subject to the provisions of the Zope Public License, -Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution. -THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED -WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS -FOR A PARTICULAR PURPOSE. diff --git a/_bootstrap/LICENSE.txt b/_bootstrap/LICENSE.txt deleted file mode 100644 index eeb9ddf..0000000 --- a/_bootstrap/LICENSE.txt +++ /dev/null @@ -1,54 +0,0 @@ -Zope Public License (ZPL) Version 2.1 -------------------------------------- - -A copyright notice accompanies this license document that -identifies the copyright holders. - -This license has been certified as open source. It has also -been designated as GPL compatible by the Free Software -Foundation (FSF). - -Redistribution and use in source and binary forms, with or -without modification, are permitted provided that the -following conditions are met: - -1. Redistributions in source code must retain the - accompanying copyright notice, this list of conditions, - and the following disclaimer. - -2. Redistributions in binary form must reproduce the accompanying - copyright notice, this list of conditions, and the - following disclaimer in the documentation and/or other - materials provided with the distribution. - -3. Names of the copyright holders must not be used to - endorse or promote products derived from this software - without prior written permission from the copyright - holders. - -4. The right to distribute this software or to use it for - any purpose does not give you the right to use - Servicemarks (sm) or Trademarks (tm) of the copyright - holders. Use of them is covered by separate agreement - with the copyright holders. - -5. If any files are modified, you must cause the modified - files to carry prominent notices stating that you changed - the files and the date of any change. - -Disclaimer - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' - AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT - NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY - AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN - NO EVENT SHALL THE COPYRIGHT HOLDERS BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE - OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH - DAMAGE.
\ No newline at end of file diff --git a/_bootstrap/bootstrap.py b/_bootstrap/bootstrap.py deleted file mode 100644 index 7728587..0000000 --- a/_bootstrap/bootstrap.py +++ /dev/null @@ -1,77 +0,0 @@ -############################################################################## -# -# Copyright (c) 2006 Zope Corporation and Contributors. -# All Rights Reserved. -# -# This software is subject to the provisions of the Zope Public License, -# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution. -# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED -# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS -# FOR A PARTICULAR PURPOSE. -# -############################################################################## -"""Bootstrap a buildout-based project - -Simply run this script in a directory containing a buildout.cfg. -The script accepts buildout command-line options, so you can -use the -c option to specify an alternate configuration file. - -$Id$ -""" - -import os, shutil, sys, tempfile, urllib2 - -tmpeggs = tempfile.mkdtemp() - -is_jython = sys.platform.startswith('java') - -try: - import pkg_resources -except ImportError: - ez = {} - exec urllib2.urlopen('http://peak.telecommunity.com/dist/ez_setup.py' - ).read() in ez - ez['use_setuptools'](to_dir=tmpeggs, download_delay=0) - - import pkg_resources - -if sys.platform == 'win32': - def quote(c): - if ' ' in c: - return '"%s"' % c # work around spawn lamosity on windows - else: - return c -else: - def quote (c): - return c - -cmd = 'from setuptools.command.easy_install import main; main()' -ws = pkg_resources.working_set - -if is_jython: - import subprocess - - assert subprocess.Popen([sys.executable] + ['-c', quote(cmd), '-mqNxd', - quote(tmpeggs), 'zc.buildout'], - env=dict(os.environ, - PYTHONPATH= - ws.find(pkg_resources.Requirement.parse('setuptools')).location - ), - ).wait() == 0 - -else: - assert os.spawnle( - os.P_WAIT, sys.executable, quote (sys.executable), - '-c', quote (cmd), '-mqNxd', quote (tmpeggs), 'zc.buildout', - dict(os.environ, - PYTHONPATH= - ws.find(pkg_resources.Requirement.parse('setuptools')).location - ), - ) == 0 - -ws.add_entry(tmpeggs) -ws.require('zc.buildout') -import zc.buildout.buildout -zc.buildout.buildout.main(sys.argv[1:] + ['bootstrap']) -shutil.rmtree(tmpeggs) diff --git a/bootstrap.py b/bootstrap.py deleted file mode 120000 index 44defc0..0000000 --- a/bootstrap.py +++ /dev/null @@ -1 +0,0 @@ -_bootstrap/bootstrap.py
\ No newline at end of file diff --git a/buildout.cfg b/buildout.cfg deleted file mode 100644 index 14850fe..0000000 --- a/buildout.cfg +++ /dev/null @@ -1,31 +0,0 @@ -[buildout] -parts = - interpreter - test - docs - tags -unzip = true - -develop = . - -[test] -recipe = zc.recipe.testrunner -eggs = beautifulsoup -defaults = '--tests-pattern ^tests --exit-with-status --suite-name additional_tests'.split() - -[docs] -recipe = z3c.recipe.sphinxdoc -eggs = beautifulsoup [docs] -index-doc = README -default.css = -layout.html = - -[interpreter] -recipe = zc.recipe.egg -interpreter = py -eggs = beautifulsoup - docutils - -[tags] -recipe = z3c.recipe.tag:tags -eggs = beautifulsoup diff --git a/src/beautifulsoup/dammit.py b/dammit.py index 78bd4b2..78bd4b2 100644 --- a/src/beautifulsoup/dammit.py +++ b/dammit.py diff --git a/src/beautifulsoup/element.py b/element.py index 7649b4c..7649b4c 100644 --- a/src/beautifulsoup/element.py +++ b/element.py diff --git a/lxml_test.py b/lxml_test.py deleted file mode 100644 index 2e25c06..0000000 --- a/lxml_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from BeautifulSoup import BeautifulStoneSoup, BeautifulSoup -from lxml_builder import LXMLTreeBuilder -from lxml import etree -builder = LXMLTreeBuilder(parser_class=etree.XMLParser) -soup = BeautifulStoneSoup("<foo>bar</foo>", builder=builder) -print soup.prettify() - -soup = BeautifulSoup("<foo>bar</foo>", builder=builder) -print soup.prettify() - -builder = LXMLTreeBuilder(parser_class=etree.HTMLParser, self_closing_tags=['br']) -soup = BeautifulSoup("<html><head><title>test<body><h1>page<!--Comment--><script>foo<b>bar</script><br />title</h1>", builder=builder) -print soup.prettify() diff --git a/setup.py b/setup.py deleted file mode 100644 index bd8619c..0000000 --- a/setup.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python - -import ez_setup -ez_setup.use_setuptools() - -import sys -from setuptools import setup, find_packages - -sys.path.insert(0, 'src') -from beautifulsoup import __version__ - -setup( - name='beautifulsoup', - version=__version__, - packages=find_packages('src'), - package_dir={'':'src'}, - include_package_data=True, - zip_safe=False, - maintainer='Leonard Richardson', - maintainer_email='leonardr@segfault.org', - long_description="""Beautiful Soup parses arbitrarily invalid XML/HTML and provides a variety of methods and Pythonic idioms for iterating and searching the parse tree.""", - license='New-style BSD', - install_requires=[ - 'setuptools', - 'zope.interface', - ], - url='https://launchpad.net/beautifulsoup', - download_url= 'https://launchpad.net/beautifulsoup/+download', - classifiers=["Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "License :: OSI Approved :: Python Software Foundation License", - "Programming Language :: Python", - "Topic :: Text Processing :: Markup :: HTML", - "Topic :: Text Processing :: Markup :: XML", - "Topic :: Text Processing :: Markup :: SGML", - "Topic :: Software Development :: Libraries :: Python Modules", - ], - extras_require=dict( - docs=['Sphinx', - 'z3c.recipe.sphinxdoc'] - ), - setup_requires=['eggtestinfo', 'setuptools_bzr'], - test_suite='beautifulsoup.tests', - ) diff --git a/src/beautifulsoup/TODO b/src/beautifulsoup/TODO deleted file mode 100644 index 84fa273..0000000 --- a/src/beautifulsoup/TODO +++ /dev/null @@ -1,42 +0,0 @@ -Here are some unit tests that fail with HTMLParser. - - def testValidButBogusDeclarationFAILS(self): - self.assertSoupEquals('<! Foo >a', '<!Foo >a') - - def testIncompleteDeclarationAtEndFAILS(self): - self.assertSoupEquals('a<!b') - - def testIncompleteEntityAtEndFAILS(self): - self.assertSoupEquals('<Hello>') - - # This is not what the original author had in mind, but it's - # a legitimate interpretation of what they wrote. - self.assertSoupEquals("""<a href="foo</a>, </a><a href="bar">baz</a>""", - '<a href="foo</a>, </a><a href="></a>, <a href="bar">baz</a>') - # SGMLParser generates bogus parse events when attribute values - # contain embedded brackets, but at least Beautiful Soup fixes - # it up a little. - self.assertSoupEquals('<a b="<a>">', '<a b="<a>"></a><a>"></a>') - self.assertSoupEquals('<a href="http://foo.com/<a> and blah and blah', - """<a href='"http://foo.com/'></a><a> and blah and blah</a>""") - - invalidEntity = "foo&#bar;baz" - soup = BeautifulStoneSoup\ - (invalidEntity, - convertEntities=htmlEnt) - self.assertEquals(str(soup), invalidEntity) - - -Tag names that contain Unicode characters crash the parser: - def testUnicodeTagNamesFAILS(self): - self.assertSoupEquals("<f_䍃f_text>2PM</f_䍃f_text>") - -Here's the implementation of NavigableString.__unicode__: - - def __unicode__(self): - return unicode(str(self)) - -It converts the Unicode to a string, and then back to Unicode. I can't -find any other way of turning an element of a Unicode subclass into a -normal Unicode object. This is pretty bad and a better technique is -welcome. diff --git a/src/beautifulsoup/python3.diff b/src/beautifulsoup/python3.diff deleted file mode 100644 index 142f2b1..0000000 --- a/src/beautifulsoup/python3.diff +++ /dev/null @@ -1,208 +0,0 @@ -=== modified file 'src/beautifulsoup/builder.py' ---- src/beautifulsoup/builder.py 2009-04-10 15:22:53 +0000 -+++ src/beautifulsoup/builder.py 2009-04-10 17:12:49 +0000 -@@ -6,7 +6,7 @@ - from element import name2codepoint - from element import ( - CData, Comment, Declaration, Entities, ProcessingInstruction) --from HTMLParser import HTMLParser, HTMLParseError -+from html.parser import HTMLParser, HTMLParseError - - __all__ = ['TreeBuilder', - 'HTMLParserXMLTreeBuilder', - -=== modified file 'src/beautifulsoup/element.py' ---- src/beautifulsoup/element.py 2009-04-10 15:22:53 +0000 -+++ src/beautifulsoup/element.py 2009-04-10 17:12:49 +0000 -@@ -1,7 +1,7 @@ - import re - import types - try: -- from htmlentitydefs import name2codepoint -+ from html.entities import name2codepoint - except ImportError: - name2codepoint = {} - -@@ -254,7 +254,7 @@ - g = generator() - while True: - try: -- i = g.next() -+ i = g.__next__() - except StopIteration: - break - if i: - -=== modified file 'src/beautifulsoup/tests/test_soup.py' ---- src/beautifulsoup/tests/test_soup.py 2009-04-10 15:45:04 +0000 -+++ src/beautifulsoup/tests/test_soup.py 2009-04-10 17:15:31 +0000 -@@ -635,9 +635,9 @@ - self.assertSoupEquals('<b>hello there</b>') - - def testEntitiesInAttributeValues(self): -- self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>', -+ self.assertSoupEquals('<x t="xñ">', b'<x t="x\xc3\xb1"></x>', - encoding='utf-8') -- self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>', -+ self.assertSoupEquals('<x t="xñ">', b'<x t="x\xc3\xb1"></x>', - encoding='utf-8') - - builder = HTMLParserTreeBuilder(convertEntities=Entities.HTML_ENTITIES) -@@ -681,11 +681,11 @@ - smart quote fixes.""" - - def testUnicodeDammitStandalone(self): -- markup = "<foo>\x92</foo>" -+ markup = b"<foo>\x92</foo>" - dammit = UnicodeDammit(markup) - self.assertEquals(dammit.unicode, "<foo>’</foo>") - -- hebrew = "\xed\xe5\xec\xf9" -+ hebrew = b"\xed\xe5\xec\xf9" - dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) - self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9') - self.assertEquals(dammit.originalEncoding, 'iso-8859-8') -@@ -697,7 +697,7 @@ - - unicodeData = u"<foo>\u00FC</foo>" - utf8 = unicodeData.encode("utf-8") -- self.assertEquals(utf8, '<foo>\xc3\xbc</foo>') -+ self.assertEquals(utf8, b'<foo>\xc3\xbc</foo>') - - unicodeSoup = BeautifulStoneSoup(unicodeData) - self.assertEquals(unicodeData, unicodeSoup.decode()) -@@ -724,8 +724,8 @@ - self.assertEqual(soup.find(text=u'Räksmörgås'),u'Räksmörgås') - - def testRewrittenXMLHeader(self): -- euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n' -- utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n" -+ euc_jp = b'<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n' -+ utf8 = b"<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n" - soup = BeautifulStoneSoup(euc_jp) - if soup.originalEncoding != "euc-jp": - raise Exception("Test failed when parsing euc-jp document. " -@@ -736,12 +736,12 @@ - self.assertEquals(soup.originalEncoding, "euc-jp") - self.assertEquals(soup.renderContents('utf-8'), utf8) - -- old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>" -+ old_text = b"<?xml encoding='windows-1252'><foo>\x92</foo>" - new_text = "<?xml version='1.0' encoding='utf-8'?><foo>’</foo>" - self.assertSoupEquals(old_text, new_text) - - def testRewrittenMetaTag(self): -- no_shift_jis_html = '''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>''' -+ no_shift_jis_html = b'''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>''' - soup = BeautifulSoup(no_shift_jis_html) - - # Beautiful Soup used to try to rewrite the meta tag even if the -@@ -751,16 +751,16 @@ - soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer) - self.assertEquals(soup.contents[0].name, 'pre') - -- meta_tag = ('<meta content="text/html; charset=x-sjis" ' -- 'http-equiv="Content-type" />') -+ meta_tag = (b'<meta content="text/html; charset=x-sjis" ' -+ b'http-equiv="Content-type" />') - shift_jis_html = ( -- '<html><head>\n%s\n' -- '<meta http-equiv="Content-language" content="ja" />' -- '</head><body><pre>\n' -- '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f' -- '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c' -- '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n' -- '</pre></body></html>') % meta_tag -+ b'<html><head>\n' + meta_tag + b'\n' -+ b'<meta http-equiv="Content-language" content="ja" />' -+ b'</head><body><pre>\n' -+ b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f' -+ b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c' -+ b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n' -+ b'</pre></body></html>') - soup = BeautifulSoup(shift_jis_html) - if soup.originalEncoding != "shift-jis": - raise Exception("Test failed when parsing shift-jis document " -@@ -773,60 +773,60 @@ - content_type_tag = soup.meta['content'] - self.assertEquals(content_type_tag[content_type_tag.find('charset='):], - 'charset=%SOUP-ENCODING%') -- content_type = str(soup.meta) -+ content_type = soup.meta.decode() - index = content_type.find('charset=') - self.assertEqual(content_type[index:index+len('charset=utf8')+1], - 'charset=utf-8') - content_type = soup.meta.encode('shift-jis') -- index = content_type.find('charset=') -+ index = content_type.find(b'charset=') - self.assertEqual(content_type[index:index+len('charset=shift-jis')], - 'charset=shift-jis'.encode()) - - self.assertEquals(soup.encode('utf-8'), ( -- '<html><head>\n' -- '<meta content="text/html; charset=utf-8" ' -- 'http-equiv="Content-type" />\n' -- '<meta http-equiv="Content-language" content="ja" />' -- '</head><body><pre>\n' -- '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3' -- '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3' -- '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6' -- '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3' -- '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n' -- '</pre></body></html>')) -+ b'<html><head>\n' -+ b'<meta content="text/html; charset=utf-8" ' -+ b'http-equiv="Content-type" />\n' -+ b'<meta http-equiv="Content-language" content="ja" />' -+ b'</head><body><pre>\n' -+ b'\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3' -+ b'\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3' -+ b'\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6' -+ b'\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3' -+ b'\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n' -+ b'</pre></body></html>')) - self.assertEquals(soup.encode("shift-jis"), - shift_jis_html.replace('x-sjis'.encode(), - 'shift-jis'.encode())) - -- isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" -+ isolatin = b"""<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" - soup = BeautifulSoup(isolatin) - - utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode()) -- utf8 = utf8.replace("\xe9", "\xc3\xa9") -+ utf8 = utf8.replace(b"\xe9", b"\xc3\xa9") - self.assertSoupEquals(soup.encode("utf-8"), utf8, encoding='utf-8') - - def testHebrew(self): -- iso_8859_8= '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n' -- utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n' -+ iso_8859_8= b'<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n' -+ utf8 = b'<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n' - soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8") - self.assertEquals(soup.encode('utf-8'), utf8) - - def testSmartQuotesNotSoSmartAnymore(self): -- self.assertSoupEquals("\x91Foo\x92 <!--blah-->", -+ self.assertSoupEquals(b"\x91Foo\x92 <!--blah-->", - '‘Foo’ <!--blah-->') - - def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self): -- smartQuotes = "Il a dit, \x8BSacré bleu!\x9b" -+ smartQuotes = b"Il a dit, \x8BSacré bleu!\x9b" - soup = BeautifulSoup(smartQuotes) - self.assertEquals(soup.decode(), - 'Il a dit, ‹Sacré bleu!›') - builder = HTMLParserTreeBuilder(convertEntities="html") - soup = BeautifulSoup(smartQuotes, builder) - self.assertEquals(soup.encode('utf-8'), -- 'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba') -+ b'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba') - - def testDontSeeSmartQuotesWhereThereAreNone(self): -- utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch" -+ utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" - self.assertSoupEquals(utf_8, encoding='utf-8') - - - diff --git a/testall.sh b/testall.sh deleted file mode 100755 index 801124f..0000000 --- a/testall.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -python BeautifulSoupTests.py && sh to3.sh && cd python3 && python3 BeautifulSoupTests.py diff --git a/src/beautifulsoup/testing.py b/testing.py index 20d087e..20d087e 100644 --- a/src/beautifulsoup/testing.py +++ b/testing.py @@ -1,9 +0,0 @@ -#!/bin/sh -mkdir python3 -for i in BeautifulSoupTests.py builder.py element.py dammit.py -do - cp $i python3/ - 2to3-3.0 -x next $i | patch -p0 python3/$i - cp python3/$i python3/$i.orig - patch -p0 python3/$i < $i.3.diff -done
\ No newline at end of file diff --git a/src/beautifulsoup/util.py b/util.py index 693a7e2..693a7e2 100644 --- a/src/beautifulsoup/util.py +++ b/util.py |