diff options
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/__init__.py | 2 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 70 | ||||
-rw-r--r-- | bs4/tests/test_htmlparser.py | 6 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 2 |
4 files changed, 45 insertions, 35 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py index 98ac57b..13dac85 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -17,7 +17,7 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.0.0b6" +__version__ = "4.0.0b7" __copyright__ = "Copyright (c) 2004-2012 Leonard Richardson" __license__ = "MIT" diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 0ec878b..62473cf 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -38,35 +38,7 @@ from bs4.builder import ( HTMLPARSER = 'html.parser' -class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder): - - is_xml = False - features = [HTML, STRICT, HTMLPARSER] - - def __init__(self, *args, **kwargs): - if CONSTRUCTOR_TAKES_STRICT: - kwargs['strict'] = False - return super(HTMLParserTreeBuilder, self).__init__(*args, **kwargs) - - def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None): - """ - :return: A 4-tuple (markup, original encoding, encoding - declared within markup, whether any characters had to be - replaced with REPLACEMENT CHARACTER). - """ - if isinstance(markup, unicode): - return markup, None, None, False - - try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit(markup, try_encodings, is_html=True) - return (dammit.markup, dammit.original_encoding, - dammit.declared_html_encoding, - dammit.contains_replacement_characters) - - def feed(self, markup): - super(HTMLParserTreeBuilder, self).feed(markup) - +class BeautifulSoupHTMLParser(HTMLParser): def handle_starttag(self, name, attrs): self.soup.handle_starttag(name, dict(attrs)) @@ -126,6 +98,40 @@ class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder): self.soup.handle_data(data) self.soup.endData(ProcessingInstruction) + +class HTMLParserTreeBuilder(HTMLTreeBuilder): + + is_xml = False + features = [HTML, STRICT, HTMLPARSER] + + def __init__(self, *args, **kwargs): + if CONSTRUCTOR_TAKES_STRICT: + kwargs['strict'] = False + self.parser_args = (args, kwargs) + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None): + """ + :return: A 4-tuple (markup, original encoding, encoding + declared within markup, whether any characters had to be + replaced with REPLACEMENT CHARACTER). + """ + if isinstance(markup, unicode): + return markup, None, None, False + + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, is_html=True) + return (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters) + + def feed(self, markup): + args, kwargs = self.parser_args + parser = BeautifulSoupHTMLParser(*args, **kwargs) + parser.soup = self.soup + parser.feed(markup) + + # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some # 3.2.3 code. This ensures they don't treat markup like <p></p> as a # string. @@ -152,7 +158,7 @@ if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: )* \s* # trailing whitespace """, re.VERBOSE) - HTMLParserTreeBuilder.locatestarttagend = locatestarttagend + BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend from html.parser import tagfind, attrfind @@ -215,7 +221,7 @@ if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: self.cdata_elem = elem.lower() self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) - HTMLParserTreeBuilder.parse_starttag = parse_starttag - HTMLParserTreeBuilder.set_cdata_mode = set_cdata_mode + BeautifulSoupHTMLParser.parse_starttag = parse_starttag + BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode CONSTRUCTOR_TAKES_STRICT = True diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py index a066fe0..d5b6ae1 100644 --- a/bs4/tests/test_htmlparser.py +++ b/bs4/tests/test_htmlparser.py @@ -1,3 +1,4 @@ +import copy from HTMLParser import HTMLParseError from bs4.element import Comment, Doctype, SoupStrainer from bs4.builder import HTMLParserTreeBuilder @@ -338,6 +339,11 @@ class TestHTMLParserTreeBuilder(SoupTest): parse_only=strainer) self.assertEqual(soup.decode(), "<b>bold</b>") + def test_deepcopy(self): + # Make sure you can copy the builder. This is important because + # the builder is part of a BeautifulSoup object, and we want to be + # able to copy that. + copy.deepcopy(self.default_builder) class TestHTMLParserTreeBuilderInvalidMarkup(SoupTest): """Tests of invalid markup for the default tree builder. diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index d6d8dcb..f39826a 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1122,8 +1122,6 @@ class TestPersistence(SoupTest): self.assertEqual(loaded.__class__, BeautifulSoup) self.assertEqual(loaded.decode(), self.tree.decode()) - @skipIf(not LXML_PRESENT, - "Skipping deepcopy test to work around htmlparser bug.") def test_deepcopy_identity(self): # Making a deepcopy of a tree yields an identical tree. copied = copy.deepcopy(self.tree) |