diff options
-rw-r--r-- | NEWS.txt | 5 | ||||
-rw-r--r-- | TODO.txt | 3 | ||||
-rw-r--r-- | bs4/__init__.py | 2 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 70 | ||||
-rw-r--r-- | bs4/tests/test_htmlparser.py | 6 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 2 | ||||
-rw-r--r-- | setup.py | 2 |
7 files changed, 51 insertions, 39 deletions
@@ -9,6 +9,11 @@ * Restored compatibility with Python 2.6. +* The install process no longer installs docs or auxiliary text files. + +* It's now possible to deepcopy a BeautifulSoup object created with + Python's built-in HTML parser. + = 4.0.0b6 (20110216) = * Multi-valued attributes like "class" always have a list of values, @@ -1,9 +1,6 @@ Bugs ---- -* You can't deepcopy a tree if it was created with the html.parser - tree builder. - * html5lib doesn't support SoupStrainers, which is OK, but there should be a warning about it. diff --git a/bs4/__init__.py b/bs4/__init__.py index 98ac57b..13dac85 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -17,7 +17,7 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.0.0b6" +__version__ = "4.0.0b7" __copyright__ = "Copyright (c) 2004-2012 Leonard Richardson" __license__ = "MIT" diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 0ec878b..62473cf 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -38,35 +38,7 @@ from bs4.builder import ( HTMLPARSER = 'html.parser' -class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder): - - is_xml = False - features = [HTML, STRICT, HTMLPARSER] - - def __init__(self, *args, **kwargs): - if CONSTRUCTOR_TAKES_STRICT: - kwargs['strict'] = False - return super(HTMLParserTreeBuilder, self).__init__(*args, **kwargs) - - def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None): - """ - :return: A 4-tuple (markup, original encoding, encoding - declared within markup, whether any characters had to be - replaced with REPLACEMENT CHARACTER). 
- """ - if isinstance(markup, unicode): - return markup, None, None, False - - try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit(markup, try_encodings, is_html=True) - return (dammit.markup, dammit.original_encoding, - dammit.declared_html_encoding, - dammit.contains_replacement_characters) - - def feed(self, markup): - super(HTMLParserTreeBuilder, self).feed(markup) - +class BeautifulSoupHTMLParser(HTMLParser): def handle_starttag(self, name, attrs): self.soup.handle_starttag(name, dict(attrs)) @@ -126,6 +98,40 @@ class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder): self.soup.handle_data(data) self.soup.endData(ProcessingInstruction) + +class HTMLParserTreeBuilder(HTMLTreeBuilder): + + is_xml = False + features = [HTML, STRICT, HTMLPARSER] + + def __init__(self, *args, **kwargs): + if CONSTRUCTOR_TAKES_STRICT: + kwargs['strict'] = False + self.parser_args = (args, kwargs) + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None): + """ + :return: A 4-tuple (markup, original encoding, encoding + declared within markup, whether any characters had to be + replaced with REPLACEMENT CHARACTER). + """ + if isinstance(markup, unicode): + return markup, None, None, False + + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, is_html=True) + return (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters) + + def feed(self, markup): + args, kwargs = self.parser_args + parser = BeautifulSoupHTMLParser(*args, **kwargs) + parser.soup = self.soup + parser.feed(markup) + + # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some # 3.2.3 code. This ensures they don't treat markup like <p></p> as a # string. 
@@ -152,7 +158,7 @@ if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: )* \s* # trailing whitespace """, re.VERBOSE) - HTMLParserTreeBuilder.locatestarttagend = locatestarttagend + BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend from html.parser import tagfind, attrfind @@ -215,7 +221,7 @@ if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: self.cdata_elem = elem.lower() self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) - HTMLParserTreeBuilder.parse_starttag = parse_starttag - HTMLParserTreeBuilder.set_cdata_mode = set_cdata_mode + BeautifulSoupHTMLParser.parse_starttag = parse_starttag + BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode CONSTRUCTOR_TAKES_STRICT = True diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py index a066fe0..d5b6ae1 100644 --- a/bs4/tests/test_htmlparser.py +++ b/bs4/tests/test_htmlparser.py @@ -1,3 +1,4 @@ +import copy from HTMLParser import HTMLParseError from bs4.element import Comment, Doctype, SoupStrainer from bs4.builder import HTMLParserTreeBuilder @@ -338,6 +339,11 @@ class TestHTMLParserTreeBuilder(SoupTest): parse_only=strainer) self.assertEqual(soup.decode(), "<b>bold</b>") + def test_deepcopy(self): + # Make sure you can copy the builder. This is important because + # the builder is part of a BeautifulSoup object, and we want to be + # able to copy that. + copy.deepcopy(self.default_builder) class TestHTMLParserTreeBuilderInvalidMarkup(SoupTest): """Tests of invalid markup for the default tree builder. 
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index d6d8dcb..f39826a 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1122,8 +1122,6 @@ class TestPersistence(SoupTest): self.assertEqual(loaded.__class__, BeautifulSoup) self.assertEqual(loaded.decode(), self.tree.decode()) - @skipIf(not LXML_PRESENT, - "Skipping deepcopy test to work around htmlparser bug.") def test_deepcopy_identity(self): # Making a deepcopy of a tree yields an identical tree. copied = copy.deepcopy(self.tree) @@ -7,7 +7,7 @@ except ImportError: from distutils.command.build_py import build_py setup(name="beautifulsoup4", - version = "4.0.0b6", + version = "4.0.0b7", author="Leonard Richardson", author_email='leonardr@segfault.org', url="http://www.crummy.com/software/BeautifulSoup/bs4/", |