-rw-r--r--  NEWS.txt                   | 25
-rw-r--r--  bs4/__init__.py            |  2
-rw-r--r--  bs4/builder/_htmlparser.py |  2
-rw-r--r--  bs4/element.py             |  7
-rw-r--r--  bs4/testing.py             |  2
-rw-r--r--  bs4/tests/test_tree.py     |  7
-rw-r--r--  doc/source/index.rst       | 18
-rw-r--r--  setup.py                   |  2

8 files changed, 32 insertions, 33 deletions
diff --git a/NEWS.txt b/NEWS.txt
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,9 +1,22 @@
 = 4.3.0 (Unreleased) =
 
-* A NavigableString object now has an immutable '.name' property whose
-  value is always None. This makes it easier to iterate over a mixed
-  list of tags and strings without having to check whether each
-  element is a tag or a string.
+* Instead of converting incoming data to Unicode and feeding it to the
+  lxml tree builder, Beautiful Soup now makes successive guesses at
+  the encoding of the incoming data, and tells lxml to parse the data
+  as that encoding. This improves performance and avoids an issue in
+  which lxml was refusing to parse strings because they were Unicode
+  strings.
+
+  This required a major overhaul of the tree builder architecture. If
+  you wrote your own tree builder and didn't tell me, you'll need to
+  modify your prepare_markup() method.
+
+* The UnicodeDammit code that makes guesses at encodings has been
+  split into its own class, EncodingDetector. A lot of apparently
+  redundant code has been removed from Unicode, Dammit, and some
+  undocumented features have also been removed.
+
+= 4.2.1 (20130531) =
 
 * The default XML formatter will now replace ampersands even if they
   appear to be part of entities. That is, "&lt;" will become
@@ -29,6 +42,10 @@
 * html5lib now supports Python 3. Fixed some Python 2-specific code
   in the html5lib test suite. [bug=1181624]
 
+* The html.parser treebuilder can now handle numeric attributes in
+  text when the hexidecimal name of the attribute starts with a
+  capital X. Patch by Tim Shirley. [bug=1186242]
+
 = 4.2.0 (20130514) =
 
 * The Tag.select() method now supports a much wider variety of CSS
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 956f26e..7b5964a 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -17,7 +17,7 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 """
 
 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.3.0"
+__version__ = "4.2.1"
 __copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
 __license__ = "MIT"
 
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 2b98969..4b80f79 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -58,6 +58,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
         # it's fixed.
         if name.startswith('x'):
             real_name = int(name.lstrip('x'), 16)
+        elif name.startswith('X'):
+            real_name = int(name.lstrip('X'), 16)
         else:
             real_name = int(name)
 
diff --git a/bs4/element.py b/bs4/element.py
index 538f6b6..f6864f2 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -672,13 +672,6 @@ class NavigableString(unicode, PageElement):
         output = self.format_string(self, formatter)
         return self.PREFIX + output + self.SUFFIX
 
-    @property
-    def name(self):
-        return None
-
-    @name.setter
-    def name(self, name):
-        raise AttributeError("A NavigableString cannot be given a name.")
 
 class PreformattedString(NavigableString):
     """A NavigableString not subject to the normal formatting rules.
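
For readers skimming the _htmlparser.py hunk above, here is a standalone sketch of
the character-reference logic that the patch extends: hexadecimal references may now
begin with a capital "X" as well as a lowercase "x". The resolve_charref helper is
illustrative only and is not part of Beautiful Soup; the real change lives in
BeautifulSoupHTMLParser.handle_charref, and this sketch assumes Python 3 (chr rather
than unichr)::

    # Illustrative sketch (not bs4 code) of the patched charref handling.
    def resolve_charref(name):
        # "xf1" and "Xf1" are hexadecimal forms of 241; "241" is decimal.
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)
        return chr(real_name)

    # All three spellings of the numeric reference resolve to the same character.
    assert (resolve_charref('241') == resolve_charref('xf1')
            == resolve_charref('Xf1')
            == '\N{LATIN SMALL LETTER N WITH TILDE}')
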
diff --git a/bs4/testing.py b/bs4/testing.py
index c363a89..fd4495a 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -228,12 +228,14 @@ class HTMLTreeBuilderSmokeTest(object):
         expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
         self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
         self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
+        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
         self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
 
     def test_entities_in_text_converted_to_unicode(self):
         expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
         self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
         self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
+        self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
         self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
 
     def test_quot_entity_converted_to_quotation_mark(self):
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index fc0e2c6..2d09f96 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1187,13 +1187,6 @@ class TestElementObjects(SoupTest):
         soup = self.soup("foo<!--IGNORE-->bar")
         self.assertEqual(['foo', 'bar'], list(soup.strings))
 
-    def test_string_has_immutable_name_property(self):
-        string = self.soup("s").string
-        self.assertEqual(None, string.name)
-        def t():
-            string.name = 'foo'
-        self.assertRaises(AttributeError, t)
-
 
 class TestCDAtaListAttributes(SoupTest):
     """Testing cdata-list attributes like 'class'.
diff --git a/doc/source/index.rst b/doc/source/index.rst
index a91854c..1b38df7 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2478,9 +2478,11 @@ become Unicode::
     dammit.original_encoding
     # 'utf-8'
 
-The more data you give Unicode, Dammit, the more accurately it will
-guess. If you have your own suspicions as to what the encoding might
-be, you can pass them in as a list::
+Unicode, Dammit's guesses will get a lot more accurate if you install
+the ``chardet`` or ``cchardet`` Python libraries. The more data you
+give Unicode, Dammit, the more accurately it will guess. If you have
+your own suspicions as to what the encoding might be, you can pass
+them in as a list::
 
     dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
     print(dammit.unicode_markup)
@@ -2823,16 +2825,6 @@ significantly faster using lxml than using html.parser or html5lib.
 You can speed up encoding detection significantly by installing the
 `cchardet <http://pypi.python.org/pypi/cchardet/>`_ library.
 
-Sometimes `Unicode, Dammit`_ can only detect the encoding of a file by
-doing a byte-by-byte examination of the file. This slows Beautiful
-Soup to a crawl. My tests indicate that this only happened on 2.x
-versions of Python, and that it happened most often with documents
-using Russian or Chinese encodings. If this is happening to you, you
-can fix it by installing cchardet, or by using Python 3 for your
-script. If you happen to know a document's encoding, you can pass
-it into the ``BeautifulSoup`` constructor as ``from_encoding``, and
-bypass encoding detection altogether.
-
 `Parsing only part of a document`_ won't save you much time parsing
 the document, but it can save a lot of memory, and it'll make
 `searching` the document much faster.
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@ except ImportError:
     from distutils.command.build_py import build_py
 
 setup(name="beautifulsoup4",
-      version = "4.2.0",
+      version = "4.2.1",
       author="Leonard Richardson",
       author_email='leonardr@segfault.org',
       url="http://www.crummy.com/software/BeautifulSoup/bs4/",
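
The doc/source/index.rst hunks above describe how Unicode, Dammit accepts a list of
candidate encodings and guesses better when chardet or cchardet is installed. A short
usage sketch of that documented behaviour, assuming bs4 is installed and running under
Python 3 (hence the bytes literal); the expected output follows the documentation
excerpt shown in the diff::

    from bs4 import UnicodeDammit

    # Suggest candidate encodings; installing chardet or cchardet further
    # improves detection for markup that does not declare its encoding.
    dammit = UnicodeDammit(b"Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
    print(dammit.unicode_markup)     # Sacré bleu!
    print(dammit.original_encoding)  # latin-1
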