summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt 5
-rw-r--r-- TODO.txt 3
-rw-r--r-- bs4/__init__.py 2
-rw-r--r-- bs4/builder/_htmlparser.py 70
-rw-r--r-- bs4/tests/test_htmlparser.py 6
-rw-r--r-- bs4/tests/test_tree.py 2
-rw-r--r-- setup.py 2
7 files changed, 51 insertions, 39 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 61c04ce..54caf6a 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -9,6 +9,11 @@
* Restored compatibility with Python 2.6.
+* The install process no longer installs docs or auxiliary text files.
+
+* It's now possible to deepcopy a BeautifulSoup object created with
+ Python's built-in HTML parser.
+
= 4.0.0b6 (20110216) =
* Multi-valued attributes like "class" always have a list of values,
diff --git a/TODO.txt b/TODO.txt
index fc305fb..61f9aee 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,9 +1,6 @@
Bugs
----
-* You can't deepcopy a tree if it was created with the html.parser
- tree builder.
-
* html5lib doesn't support SoupStrainers, which is OK, but there
should be a warning about it.
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 98ac57b..13dac85 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -17,7 +17,7 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.0.0b6"
+__version__ = "4.0.0b7"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__license__ = "MIT"
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 0ec878b..62473cf 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -38,35 +38,7 @@ from bs4.builder import (
HTMLPARSER = 'html.parser'
-class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder):
-
- is_xml = False
- features = [HTML, STRICT, HTMLPARSER]
-
- def __init__(self, *args, **kwargs):
- if CONSTRUCTOR_TAKES_STRICT:
- kwargs['strict'] = False
- return super(HTMLParserTreeBuilder, self).__init__(*args, **kwargs)
-
- def prepare_markup(self, markup, user_specified_encoding=None,
- document_declared_encoding=None):
- """
- :return: A 4-tuple (markup, original encoding, encoding
- declared within markup, whether any characters had to be
- replaced with REPLACEMENT CHARACTER).
- """
- if isinstance(markup, unicode):
- return markup, None, None, False
-
- try_encodings = [user_specified_encoding, document_declared_encoding]
- dammit = UnicodeDammit(markup, try_encodings, is_html=True)
- return (dammit.markup, dammit.original_encoding,
- dammit.declared_html_encoding,
- dammit.contains_replacement_characters)
-
- def feed(self, markup):
- super(HTMLParserTreeBuilder, self).feed(markup)
-
+class BeautifulSoupHTMLParser(HTMLParser):
def handle_starttag(self, name, attrs):
self.soup.handle_starttag(name, dict(attrs))
@@ -126,6 +98,40 @@ class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder):
self.soup.handle_data(data)
self.soup.endData(ProcessingInstruction)
+
+class HTMLParserTreeBuilder(HTMLTreeBuilder):
+
+ is_xml = False
+ features = [HTML, STRICT, HTMLPARSER]
+
+ def __init__(self, *args, **kwargs):
+ if CONSTRUCTOR_TAKES_STRICT:
+ kwargs['strict'] = False
+ self.parser_args = (args, kwargs)
+
+ def prepare_markup(self, markup, user_specified_encoding=None,
+ document_declared_encoding=None):
+ """
+ :return: A 4-tuple (markup, original encoding, encoding
+ declared within markup, whether any characters had to be
+ replaced with REPLACEMENT CHARACTER).
+ """
+ if isinstance(markup, unicode):
+ return markup, None, None, False
+
+ try_encodings = [user_specified_encoding, document_declared_encoding]
+ dammit = UnicodeDammit(markup, try_encodings, is_html=True)
+ return (dammit.markup, dammit.original_encoding,
+ dammit.declared_html_encoding,
+ dammit.contains_replacement_characters)
+
+ def feed(self, markup):
+ args, kwargs = self.parser_args
+ parser = BeautifulSoupHTMLParser(*args, **kwargs)
+ parser.soup = self.soup
+ parser.feed(markup)
+
+
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
@@ -152,7 +158,7 @@ if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
)*
\s* # trailing whitespace
""", re.VERBOSE)
- HTMLParserTreeBuilder.locatestarttagend = locatestarttagend
+ BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
from html.parser import tagfind, attrfind
@@ -215,7 +221,7 @@ if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
self.cdata_elem = elem.lower()
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
- HTMLParserTreeBuilder.parse_starttag = parse_starttag
- HTMLParserTreeBuilder.set_cdata_mode = set_cdata_mode
+ BeautifulSoupHTMLParser.parse_starttag = parse_starttag
+ BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
CONSTRUCTOR_TAKES_STRICT = True
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index a066fe0..d5b6ae1 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -1,3 +1,4 @@
+import copy
from HTMLParser import HTMLParseError
from bs4.element import Comment, Doctype, SoupStrainer
from bs4.builder import HTMLParserTreeBuilder
@@ -338,6 +339,11 @@ class TestHTMLParserTreeBuilder(SoupTest):
parse_only=strainer)
self.assertEqual(soup.decode(), "<b>bold</b>")
+ def test_deepcopy(self):
+ # Make sure you can copy the builder. This is important because
+ # the builder is part of a BeautifulSoup object, and we want to be
+ # able to copy that.
+ copy.deepcopy(self.default_builder)
class TestHTMLParserTreeBuilderInvalidMarkup(SoupTest):
"""Tests of invalid markup for the default tree builder.
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index d6d8dcb..f39826a 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1122,8 +1122,6 @@ class TestPersistence(SoupTest):
self.assertEqual(loaded.__class__, BeautifulSoup)
self.assertEqual(loaded.decode(), self.tree.decode())
- @skipIf(not LXML_PRESENT,
- "Skipping deepcopy test to work around htmlparser bug.")
def test_deepcopy_identity(self):
# Making a deepcopy of a tree yields an identical tree.
copied = copy.deepcopy(self.tree)
diff --git a/setup.py b/setup.py
index 46d5ff2..0d5b7d7 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@ except ImportError:
from distutils.command.build_py import build_py
setup(name="beautifulsoup4",
- version = "4.0.0b6",
+ version = "4.0.0b7",
author="Leonard Richardson",
author_email='leonardr@segfault.org',
url="http://www.crummy.com/software/BeautifulSoup/bs4/",