summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--NEWS.txt25
-rw-r--r--bs4/__init__.py2
-rw-r--r--bs4/builder/_htmlparser.py2
-rw-r--r--bs4/element.py7
-rw-r--r--bs4/testing.py2
-rw-r--r--bs4/tests/test_tree.py7
-rw-r--r--doc/source/index.rst18
-rw-r--r--setup.py2
8 files changed, 32 insertions, 33 deletions
diff --git a/NEWS.txt b/NEWS.txt
index a3485e7..3d0846f 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,9 +1,22 @@
= 4.3.0 (Unreleased) =
-* A NavigableString object now has an immutable '.name' property whose
- value is always None. This makes it easier to iterate over a mixed
- list of tags and strings without having to check whether each
- element is a tag or a string.
+* Instead of converting incoming data to Unicode and feeding it to the
+ lxml tree builder, Beautiful Soup now makes successive guesses at
+ the encoding of the incoming data, and tells lxml to parse the data
+ as that encoding. This improves performance and avoids an issue in
+ which lxml was refusing to parse strings because they were Unicode
+ strings.
+
+ This required a major overhaul of the tree builder architecture. If
+ you wrote your own tree builder and didn't tell me, you'll need to
+ modify your prepare_markup() method.
+
+* The UnicodeDammit code that makes guesses at encodings has been
+ split into its own class, EncodingDetector. A lot of apparently
+ redundant code has been removed from Unicode, Dammit, and some
+ undocumented features have also been removed.
+
+= 4.2.1 (20130531) =
* The default XML formatter will now replace ampersands even if they
  appear to be part of entities. That is, "&amp;lt;" will become "&amp;amp;lt;".
@@ -29,6 +42,10 @@
* html5lib now supports Python 3. Fixed some Python 2-specific
code in the html5lib test suite. [bug=1181624]
+* The html.parser treebuilder can now handle numeric attributes in
+  text when the hexadecimal name of the attribute starts with a
+ capital X. Patch by Tim Shirley. [bug=1186242]
+
= 4.2.0 (20130514) =
* The Tag.select() method now supports a much wider variety of CSS
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 956f26e..7b5964a 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -17,7 +17,7 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.3.0"
+__version__ = "4.2.1"
__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
__license__ = "MIT"
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 2b98969..4b80f79 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -58,6 +58,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
# it's fixed.
if name.startswith('x'):
real_name = int(name.lstrip('x'), 16)
+ elif name.startswith('X'):
+ real_name = int(name.lstrip('X'), 16)
else:
real_name = int(name)
diff --git a/bs4/element.py b/bs4/element.py
index 538f6b6..f6864f2 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -672,13 +672,6 @@ class NavigableString(unicode, PageElement):
output = self.format_string(self, formatter)
return self.PREFIX + output + self.SUFFIX
- @property
- def name(self):
- return None
-
- @name.setter
- def name(self, name):
- raise AttributeError("A NavigableString cannot be given a name.")
class PreformattedString(NavigableString):
"""A NavigableString not subject to the normal formatting rules.
diff --git a/bs4/testing.py b/bs4/testing.py
index c363a89..fd4495a 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -228,12 +228,14 @@ class HTMLTreeBuilderSmokeTest(object):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
+ self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
def test_entities_in_text_converted_to_unicode(self):
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
+ self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
def test_quot_entity_converted_to_quotation_mark(self):
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index fc0e2c6..2d09f96 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1187,13 +1187,6 @@ class TestElementObjects(SoupTest):
soup = self.soup("foo<!--IGNORE-->bar")
self.assertEqual(['foo', 'bar'], list(soup.strings))
- def test_string_has_immutable_name_property(self):
- string = self.soup("s").string
- self.assertEqual(None, string.name)
- def t():
- string.name = 'foo'
- self.assertRaises(AttributeError, t)
-
class TestCDAtaListAttributes(SoupTest):
"""Testing cdata-list attributes like 'class'.
diff --git a/doc/source/index.rst b/doc/source/index.rst
index a91854c..1b38df7 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2478,9 +2478,11 @@ become Unicode::
dammit.original_encoding
# 'utf-8'
-The more data you give Unicode, Dammit, the more accurately it will
-guess. If you have your own suspicions as to what the encoding might
-be, you can pass them in as a list::
+Unicode, Dammit's guesses will get a lot more accurate if you install
+the ``chardet`` or ``cchardet`` Python libraries. The more data you
+give Unicode, Dammit, the more accurately it will guess. If you have
+your own suspicions as to what the encoding might be, you can pass
+them in as a list::
dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
print(dammit.unicode_markup)
@@ -2823,16 +2825,6 @@ significantly faster using lxml than using html.parser or html5lib.
You can speed up encoding detection significantly by installing the
`cchardet <http://pypi.python.org/pypi/cchardet/>`_ library.
-Sometimes `Unicode, Dammit`_ can only detect the encoding of a file by
-doing a byte-by-byte examination of the file. This slows Beautiful
-Soup to a crawl. My tests indicate that this only happened on 2.x
-versions of Python, and that it happened most often with documents
-using Russian or Chinese encodings. If this is happening to you, you
-can fix it by installing cchardet, or by using Python 3 for your
-script. If you happen to know a document's encoding, you can pass
-it into the ``BeautifulSoup`` constructor as ``from_encoding``, and
-bypass encoding detection altogether.
-
`Parsing only part of a document`_ won't save you much time parsing
the document, but it can save a lot of memory, and it'll make
`searching` the document much faster.
diff --git a/setup.py b/setup.py
index c1eb127..96457cd 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@ except ImportError:
from distutils.command.build_py import build_py
setup(name="beautifulsoup4",
- version = "4.2.0",
+ version = "4.2.1",
author="Leonard Richardson",
author_email='leonardr@segfault.org',
url="http://www.crummy.com/software/BeautifulSoup/bs4/",