From ab7ed77ab3560f6d574d577befc7a1f593e45327 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Mon, 20 Feb 2012 11:43:46 -0500 Subject: Changed the class structure so that the default parser test class uses html.parser. --- bs4/builder/_htmlparser.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'bs4/builder/_htmlparser.py') diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index c785eed..0ec878b 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -80,9 +80,15 @@ class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder): # XXX workaround for a bug in HTMLParser. Remove this once # it's fixed. if name.startswith('x'): - data = unichr(int(name.lstrip('x'), 16)) + real_name = int(name.lstrip('x'), 16) else: - data = unichr(int(name)) + real_name = int(name) + + try: + data = unichr(real_name) + except (ValueError, OverflowError), e: + data = u"\N{REPLACEMENT CHARACTER}" + self.handle_data(data) def handle_entityref(self, name): -- cgit v1.2.3 From 60cb51632dce022d1a4aff18500d286e58e0bd5c Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Mon, 20 Feb 2012 14:05:38 -0500 Subject: It's now possible to copy a BeautifulSoup object created with the html.parser treebuilder. 
--- bs4/builder/_htmlparser.py | 70 +++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 32 deletions(-) (limited to 'bs4/builder/_htmlparser.py') diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 0ec878b..62473cf 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -38,35 +38,7 @@ from bs4.builder import ( HTMLPARSER = 'html.parser' -class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder): - - is_xml = False - features = [HTML, STRICT, HTMLPARSER] - - def __init__(self, *args, **kwargs): - if CONSTRUCTOR_TAKES_STRICT: - kwargs['strict'] = False - return super(HTMLParserTreeBuilder, self).__init__(*args, **kwargs) - - def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None): - """ - :return: A 4-tuple (markup, original encoding, encoding - declared within markup, whether any characters had to be - replaced with REPLACEMENT CHARACTER). - """ - if isinstance(markup, unicode): - return markup, None, None, False - - try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit(markup, try_encodings, is_html=True) - return (dammit.markup, dammit.original_encoding, - dammit.declared_html_encoding, - dammit.contains_replacement_characters) - - def feed(self, markup): - super(HTMLParserTreeBuilder, self).feed(markup) - +class BeautifulSoupHTMLParser(HTMLParser): def handle_starttag(self, name, attrs): self.soup.handle_starttag(name, dict(attrs)) @@ -126,6 +98,40 @@ class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder): self.soup.handle_data(data) self.soup.endData(ProcessingInstruction) + +class HTMLParserTreeBuilder(HTMLTreeBuilder): + + is_xml = False + features = [HTML, STRICT, HTMLPARSER] + + def __init__(self, *args, **kwargs): + if CONSTRUCTOR_TAKES_STRICT: + kwargs['strict'] = False + self.parser_args = (args, kwargs) + + def prepare_markup(self, markup, user_specified_encoding=None, + 
document_declared_encoding=None): + """ + :return: A 4-tuple (markup, original encoding, encoding + declared within markup, whether any characters had to be + replaced with REPLACEMENT CHARACTER). + """ + if isinstance(markup, unicode): + return markup, None, None, False + + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, is_html=True) + return (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters) + + def feed(self, markup): + args, kwargs = self.parser_args + parser = BeautifulSoupHTMLParser(*args, **kwargs) + parser.soup = self.soup + parser.feed(markup) + + # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some # 3.2.3 code. This ensures they don't treat markup like

<p></p> as a # string. @@ -152,7 +158,7 @@ if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: )* \s* # trailing whitespace """, re.VERBOSE) - HTMLParserTreeBuilder.locatestarttagend = locatestarttagend + BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend from html.parser import tagfind, attrfind @@ -215,7 +221,7 @@ if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: self.cdata_elem = elem.lower() self.interesting = re.compile(r'</%s\s*>' % self.cdata_elem, re.I) - HTMLParserTreeBuilder.parse_starttag = parse_starttag - HTMLParserTreeBuilder.set_cdata_mode = set_cdata_mode + BeautifulSoupHTMLParser.parse_starttag = parse_starttag + BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode CONSTRUCTOR_TAKES_STRICT = True -- cgit v1.2.3